// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <array>
#include <cstdint>
#include <tuple>
#include <vector>
// Placeholder for get runfiles header.
#include "absl/status/status.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "absl/types/span.h"
#include "gtest/gtest.h"
#include "include/ghc/filesystem.hpp"
#include "sparse_matmul/compute/matmul.h"
#include "sparse_matmul/layers/utils.h"
#include "sparse_matmul/numerics/test_utils.h"
#include "sparse_matmul/os/coop_threads.h"
namespace csrblocksparse {
namespace {
inline constexpr absl::string_view kTestdataPath = "layers/testdata";
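// Round-trips a small hand-built block-sparse matrix through
// WriteToFlatBuffer and the flat-buffer constructor, then checks that the
// deserialized copy produces the same SpMM_bias result as the original.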
TEST(CSRBlockSparseMatrix, FlatBufferSerialization) {
const int kRows = 8;
const int kCols = 8;
std::vector<int> mask = {1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1,
0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1};
std::vector<float> values(kRows * kCols, 1.f);
values[1] = 2.f;
values[3] = 3.f;
values[36] = -1.f;
values[45] = -2.f;
csrblocksparse::CacheAlignedVector<float> bias(kRows);
csrblocksparse::CacheAlignedVector<float> rhs(kCols);
csrblocksparse::CacheAlignedVector<float> out_ref(kRows);
csrblocksparse::CacheAlignedVector<float> out_test(kRows);
bias.FillZero();
rhs.FillOnes();
csrblocksparse::MaskedSparseMatrix<float> matrix(kRows, kCols, mask.data(),
values.data());
matrix.SpMM_bias(rhs, bias, &out_ref);
csrblocksparse::CsrBlockSparseMatrix<csrblocksparse::bfloat16, float, int16_t>
block_sparse_matrix(matrix);
std::string buffer;
std::size_t num_bytes = block_sparse_matrix.WriteToFlatBuffer(&buffer);
csrblocksparse::CsrBlockSparseMatrix<csrblocksparse::bfloat16, float, int16_t>
new_block_sparse_matrix(reinterpret_cast<const uint8_t*>(buffer.c_str()),
num_bytes);
new_block_sparse_matrix.SpMM_bias(rhs, bias, &out_test);
CheckResult(out_ref, out_test, kCols);
}
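// Builds a random masked matrix of the requested size, block shape and
// sparsity, computes a reference product with the masked implementation, and
// checks that the CsrBlockSparseMatrix/SparseLinearLayer path matches when
// each thread's portion is applied serially (and optionally via MatVec too).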
template <typename ComputeType, typename RhsType, typename OutType>
void CorrectnessCheckBlockSpMM(int rows, int cols, int block_height,
int block_width, float sparsity,
bool use_relu = false, int num_threads = 1,
int fatness = 1, bool test_matmul = false) {
using BiasType = typename TypeOfProduct<ComputeType, RhsType>::type;
MaskedSparseMatrix<float> matrix(rows, cols, sparsity, block_height,
block_width);
matrix.CastWeights<ComputeType>();
FatCacheAlignedVector<RhsType> rhs(cols, fatness);
CacheAlignedVector<BiasType> bias(rows);
FatCacheAlignedVector<OutType> out(rows, fatness);
bias.FillRandom();
rhs.FillRandom();
out.FillZero();
FatCacheAlignedVector<OutType> out_reference = out;
matrix.SpMM_bias(rhs, bias, &out_reference, use_relu);
CsrBlockSparseMatrix<ComputeType, RhsType> sparse_matrix(matrix);
SparseLinearLayer<ComputeType, RhsType> sparse_linear_layer(
std::move(sparse_matrix), std::move(bias));
num_threads = sparse_linear_layer.PrepareForThreads(num_threads);
// Checks that the result of applying each thread's portion serially is
// correct.
for (int thread_id = 0; thread_id < num_threads; ++thread_id) {
sparse_linear_layer.SpMM_bias(rhs, &out, use_relu, thread_id);
}
CheckResult(out_reference, out, sparse_linear_layer.cols());
if (test_matmul) {
for (int thread_id = 0; thread_id < num_threads; ++thread_id) {
sparse_linear_layer.MatVec(rhs, use_relu, thread_id,
/*replicas=*/1, /*output_stride=*/0, &out);
}
CheckResult(out_reference, out, sparse_linear_layer.cols());
}
}
// Does:
// y = Ax + b;
// x = Ay + b;
// y = Ax + b;
//
// to make sure that dependent multiplies are correct.
template <typename ComputeType, typename RhsType, typename OutType>
void ThreadBody(
SpinBarrier* spin_barrier, int tid,
const SparseLinearLayer<ComputeType, RhsType>& sparse_linear_layer,
FatCacheAlignedVector<RhsType>* rhs, FatCacheAlignedVector<OutType>* out,
bool use_relu) {
sparse_linear_layer.SpMM_bias(*rhs, out, use_relu, tid);
spin_barrier->barrier();
sparse_linear_layer.SpMM_bias(*out, rhs, use_relu, tid);
spin_barrier->barrier();
sparse_linear_layer.SpMM_bias(*rhs, out, use_relu, tid);
}
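// Multi-threaded counterpart of CorrectnessCheckBlockSpMM: the three chained
// products are first computed serially as the reference, then ThreadBody runs
// them on |num_threads| threads and the final output is compared. Requires
// rows == cols so the output can be fed back in as the next input.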
template <typename ComputeType, typename RhsType, typename OutType>
void CorrectnessCheckBlockSpMM_MultiThread(int rows, int cols, int block_height,
int block_width, float sparsity,
bool use_relu = false,
int num_threads = 1,
int fatness = 1) {
  using BiasType = typename TypeOfProduct<ComputeType, RhsType>::type;
CHECK(rows == cols);
MaskedSparseMatrix<float> matrix(rows, cols, sparsity, block_height,
block_width);
matrix.CastWeights<ComputeType>();
FatCacheAlignedVector<RhsType> rhs(cols, fatness);
FatCacheAlignedVector<RhsType> rhs_mt(cols, fatness);
CacheAlignedVector<BiasType> bias(rows);
FatCacheAlignedVector<OutType> out(rows, fatness);
bias.FillOnes();
rhs.FillOnes();
rhs_mt.FillOnes();
out.FillZero();
FatCacheAlignedVector<OutType> out_reference = out;
matrix.SpMM_bias(rhs, bias, &out_reference, use_relu);
matrix.SpMM_bias(out_reference, bias, &rhs, use_relu);
matrix.SpMM_bias(rhs, bias, &out_reference, use_relu);
CsrBlockSparseMatrix<ComputeType, RhsType> sparse_matrix(matrix);
num_threads = sparse_matrix.PrepareForThreads(num_threads,
/*cache_line_size=*/1);
SparseLinearLayer<ComputeType, RhsType> sparse_linear_layer(
std::move(sparse_matrix), std::move(bias));
csrblocksparse::LaunchOnThreadsWithBarrier(
num_threads, ThreadBody<ComputeType, RhsType, OutType>,
sparse_linear_layer, &rhs_mt, &out, use_relu);
CheckResult(out_reference, out, cols);
}
} // namespace
TEST(MaskedSparseCorrectness, HandCoded) {
const int kRows = 8;
const int kCols = 8;
// clang-format off
std::vector<int> mask = {1, 1, 0, 0, 0, 1, 1, 1,
0, 1, 0, 1, 0, 1, 0, 1,
1, 0, 0, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 1, 1, 0, 0,
1, 1, 0, 0, 1, 1, 0, 0,
1, 0, 0, 0, 0, 1, 0, 1};
// clang-format on
std::vector<float> values(kRows * kCols, 1.f);
std::vector<float> answer = {6.f, 5.f, 6.f, 1.f, 9.f, 3.f, 5.f, 4.f};
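  // With all weights, rhs entries and biases equal to 1, each output is the
  // number of nonzeros in the corresponding mask row plus 1, e.g. row 0 has
  // 5 nonzeros, so answer[0] == 6.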
MaskedSparseMatrix<float> matrix(kRows, kCols, mask.data(), values.data());
CacheAlignedVector<float> rhs(kCols);
CacheAlignedVector<float> bias(kRows);
CacheAlignedVector<float> out(kRows);
bias.FillOnes();
rhs.FillOnes();
out.FillZero();
MaskedLinearLayer<float> masked_linear_layer(std::move(matrix),
std::move(bias));
masked_linear_layer.SpMM_bias(rhs, &out);
for (int i = 0; i < kRows; ++i) {
EXPECT_EQ(answer[i], out[i]);
}
}
TEST(MaskedSparseCorrectness, HandCodedFatVector) {
const int kRows = 8;
const int kCols = 8;
// clang-format off
std::vector<int> mask = {1, 1, 0, 0, 0, 1, 1, 1,
0, 1, 0, 1, 0, 1, 0, 1,
1, 0, 0, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 1, 1, 0, 0,
1, 1, 0, 0, 1, 1, 0, 0,
1, 0, 0, 0, 0, 1, 0, 1};
// clang-format on
std::vector<float> values(kRows * kCols, 1.f);
std::vector<float> answer = {6.f, 5.f, 6.f, 1.f, 9.f, 3.f, 5.f, 4.f};
MaskedSparseMatrix<float> matrix(kRows, kCols, mask.data(), values.data());
  const int kMaxWidth = 5;
  // |matrix| is moved into the layer inside the loop body, so this loop can
  // only execute a single iteration.
  for (int width = kMaxWidth; width <= kMaxWidth; ++width) {
FatCacheAlignedVector<float> rhs(kCols, width);
CacheAlignedVector<float> bias(kRows);
FatCacheAlignedVector<float> out(kRows, width);
bias.FillOnes();
rhs.FillOnes();
out.FillZero();
MaskedLinearLayer<float> masked_linear_layer(std::move(matrix),
std::move(bias));
masked_linear_layer.SpMM_bias(rhs, &out);
for (int i = 0; i < kRows; ++i) {
      for (int col = 0; col < kMaxWidth; ++col) {
        EXPECT_EQ(answer[i], out[i + col * kRows]);
}
}
}
}
TEST(CsrBlockSparseMatrix, HandCodedMultiThread) {
const int kRows = 8;
const int kCols = 8;
// clang-format off
std::vector<int> mask = {1, 1, 0, 0, 0, 1, 1, 1,
0, 1, 0, 1, 0, 1, 0, 1,
1, 0, 0, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 1, 1, 0, 0,
1, 1, 0, 0, 1, 1, 0, 0,
1, 0, 0, 0, 0, 1, 0, 1};
// clang-format on
std::vector<float> values(kRows * kCols, 1.f);
std::vector<float> answer = {6.f, 5.f, 6.f, 1.f, 9.f, 3.f, 5.f, 4.f};
MaskedSparseMatrix<float> matrix(kRows, kCols, mask.data(), values.data());
CacheAlignedVector<float> rhs(kCols);
CacheAlignedVector<float> bias(kRows);
CacheAlignedVector<float> out(kRows);
bias.FillOnes();
rhs.FillOnes();
out.FillZero();
CacheAlignedVector<float> bias_csr = bias;
CsrBlockSparseMatrix<bfloat16, float> sparse_matrix(matrix);
MaskedLinearLayer<float> masked_linear_layer(std::move(matrix),
std::move(bias));
masked_linear_layer.SpMM_bias(rhs, &out);
SparseLinearLayer<bfloat16, float> sparse_linear_layer(
std::move(sparse_matrix), std::move(bias_csr));
sparse_linear_layer.PrepareForThreads(2, /*cache_line_size=*/1);
CacheAlignedVector<float> out_tmp(kRows);
const bool kUseRelu = false;
sparse_linear_layer.SpMM_bias(rhs, &out_tmp, kUseRelu, /*tid=*/0);
sparse_linear_layer.SpMM_bias(rhs, &out_tmp, kUseRelu, /*tid=*/1);
for (int i = 0; i < kRows; ++i) {
EXPECT_EQ(answer[i], out_tmp[i]);
}
}
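// The cast tests quantize the weights via CastWeights and check that the
// resulting values still match the original floats to within CheckResult's
// tolerance.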
TEST(TestCasts, TestBfloat16) {
const int kRows = 1000;
const int kCols = 100;
const float kSparsity = 0.f;
MaskedSparseMatrix<float> matrix(kRows, kCols, kSparsity);
MaskedSparseMatrix<float> matrix_bfloat16(kRows, kCols, matrix.mask().data(),
matrix.values().data());
matrix_bfloat16.CastWeights<bfloat16>();
CheckResult(matrix.values(), matrix_bfloat16.values(), kCols);
}
TEST(TestCasts, TestFP16) {
const int kRows = 1000;
const int kCols = 100;
const float kSparsity = 0.f;
MaskedSparseMatrix<float> matrix(kRows, kCols, kSparsity);
#if !defined __arm__ && !defined __aarch64__
  // The fp16 conversion doesn't handle denormals, so flush values below the
  // smallest normal fp16 value (2^-14) to zero first.
  for (int i = 0; i < matrix.values().size(); ++i) {
    if (std::abs(matrix.data()[i]) < 1.f / static_cast<float>(1 << 14)) {
      matrix.data()[i] = 0.f;
    }
  }
#endif
MaskedSparseMatrix<float> matrix_fp16(kRows, kCols, matrix.mask().data(),
matrix.values().data());
matrix_fp16.CastWeights<csrblocksparse::fp16>();
CheckResult(matrix.values(), matrix_fp16.values(), kCols);
}
TEST(TestCasts, TestFixed16) {
const int kRows = 100000;
const int kCols = 1;
const float kSparsity = 0.f;
MaskedSparseMatrix<float> matrix(kRows, kCols, kSparsity);
// Relative error for fixed point is high near 0.
for (int i = 0; i < matrix.values().size(); ++i) {
    // 1.1e-3 is based on the max error of .013 and a grid spacing of
    // 1 / 2**15 == 3e-5: (3e-5 / 2) / .013 = 1.1e-3.
if (std::abs(matrix.data()[i]) < 1.1e-3) {
matrix.data()[i] = 0.f;
}
}
MaskedSparseMatrix<float> matrix_fixed16 = matrix;
matrix_fixed16.CastWeights<csrblocksparse::fixed16</*ExponentBits=*/0>>();
CheckResult(matrix.values(), matrix_fixed16.values(), kCols);
}
TEST(TestCasts, TestFixed32) {
const int kRows = 100000;
const int kCols = 1;
const float kSparsity = 0.f;
MaskedSparseMatrix<float> matrix(kRows, kCols, kSparsity);
MaskedSparseMatrix<float> matrix_fixed32 = matrix;
matrix_fixed32.CastWeights<csrblocksparse::fixed32</*ExponentBits=*/0>>();
CheckResult(matrix.values(), matrix_fixed32.values(), kCols);
}
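// Sweeps ReLU on/off, several sparsity levels, a range of matrix shapes, and
// thread counts through CorrectnessCheckBlockSpMM for the given block shape
// and fatness (number of right-hand-side columns).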
template <typename ComputeType, typename RhsType, typename OutType>
void TestSpMM(int block_width, int block_height, int fatness,
bool test_matmul = false) {
std::array<bool, 2> use_relu = {false, true};
std::vector<float> sparsity_levels = {.5, .8, .9, .95, .98};
std::vector<std::pair<int, int>> sizes = {{8, 8}, {128, 128}, {128, 64},
{256, 192}, {512, 512}, {1024, 512},
{384, 384}, {512, 384}};
for (int num_threads = 1; num_threads < 2 + test_matmul; ++num_threads) {
for (const auto& relu : use_relu) {
for (const auto& sparsity : sparsity_levels) {
for (const auto& size : sizes) {
int rows, cols;
std::tie(rows, cols) = size;
CorrectnessCheckBlockSpMM<ComputeType, RhsType, OutType>(
rows, cols, block_height, block_width, sparsity, relu,
num_threads, fatness, test_matmul);
}
}
}
}
}
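// Same sweep as TestSpMM, but through the multi-threaded checker; only square
// sizes are used because the output is fed back in as the next input.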
template <typename ComputeType, typename RhsType, typename OutType>
void TestSpMM_MultiThread(int block_width, int block_height, int fatness) {
std::array<bool, 2> use_relu = {false, true};
std::vector<float> sparsity_levels = {.5, .8, .9, .95, .98};
std::vector<std::pair<int, int>> sizes = {
{48, 48}, {128, 128}, {512, 512}, {384, 384}};
for (int num_threads = 1; num_threads < 5; ++num_threads) {
for (const auto& relu : use_relu) {
for (const auto& sparsity : sparsity_levels) {
for (const auto& size : sizes) {
int rows, cols;
std::tie(rows, cols) = size;
CorrectnessCheckBlockSpMM_MultiThread<ComputeType, RhsType, OutType>(
rows, cols, block_height, block_width, sparsity, relu,
num_threads, fatness);
}
}
}
}
}
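// Fills a vector with 1..size, adds it to itself with detail::SumVectors, and
// checks the [start, end) range of the result against the doubled values.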
template <typename DataType>
void TestSumVectors(int start = 0, int end = -1, int size = 6) {
std::vector<DataType> values;
std::vector<DataType> answer;
for (int i = 1; i < size + 1; ++i) {
const float x = static_cast<float>(i);
values.push_back(static_cast<DataType>(x));
answer.push_back(static_cast<DataType>(x * 2));
}
if (end == -1) {
end = values.size();
}
csrblocksparse::CacheAlignedVector<DataType> result(values.size());
csrblocksparse::CacheAlignedVector<DataType> values_aligned(values);
detail::SumVectors(start, end, values_aligned.data(), values_aligned.data(),
result.data());
for (int i = start; i < end; ++i) {
EXPECT_EQ(static_cast<float>(answer[i]), static_cast<float>(result[i]));
}
}
TEST(CsrBlockSparseMatrix, SumVectors_Generic) {
TestSumVectors<float>();
TestSumVectors<float>(1);
TestSumVectors<float>(1, 4);
}
TEST(CsrBlockSparseMatrix, SumVectors_Bfloat16) {
TestSumVectors<csrblocksparse::bfloat16>();
TestSumVectors<csrblocksparse::bfloat16>(1);
TestSumVectors<csrblocksparse::bfloat16>(1, 4);
}
// For SIMD-optimized SumVectors, the memory of the vector should be at least
// |kSIMDWidth * sizeof(float)| long, and the start position has to be an
// aligned memory location, so |size| is set to 100 to be safe and |start| to
// 0 (|start| == 1 is not aligned).
TEST(CsrBlockSparseMatrix, SumVectors_Fixed16) {
TestSumVectors<csrblocksparse::fixed16<8>>(0, -1, 100);
TestSumVectors<csrblocksparse::fixed16<8>>(0, 4, 100);
}
TEST(CsrBlockSparseMatrix, SumVectors_Fixed32) {
TestSumVectors<csrblocksparse::fixed32<11>>(0, -1, 100);
TestSumVectors<csrblocksparse::fixed32<11>>(0, 4, 100);
}
TEST(CsrBlockSparseMatrix, SpMM_Block4x4_Bfloat16) {
TestSpMM<csrblocksparse::bfloat16, float, float>(/*block_width=*/4,
/*block_height=*/4,
/*fatness=*/7);
}
// This actually uses multiple threads, and uses the output as the input for
// multiple steps to test that synchronization and memory visibility are
// working correctly. Requires square matrices.
TEST(CsrBlockSparseMatrix, SpMV_4x4MultiThreading_Bfloat16) {
TestSpMM_MultiThread<csrblocksparse::bfloat16, float, float>(
/*block_width=*/4,
/*block_height=*/4,
/*fatness=*/1);
}
TEST(CsrBlockSparseMatrix, SpMM_4x4MultiThreading_Bfloat16) {
TestSpMM_MultiThread<csrblocksparse::bfloat16, float, float>(
/*block_width=*/4,
/*block_height=*/4,
/*fatness=*/7);
}
TEST(CsrBlockSparseMatrix, SpMV_Block1x1_Bfloat16) {
TestSpMM<csrblocksparse::bfloat16, float, float>(/*block_width=*/1,
/*block_height=*/1,
/*fatness=*/1);
}
TEST(CsrBlockSparseMatrix, SpMM_Block1x1_Bfloat16) {
TestSpMM<csrblocksparse::bfloat16, float, float>(/*block_width=*/1,
/*block_height=*/1,
/*fatness=*/7);
}
// This actually uses multiple threads, and uses the output as the input for
// multiple steps to test that synchronization and memory visibility are
// working correctly. Requires square matrices.
TEST(CsrBlockSparseMatrix, SpMV_1x1MultiThreading_Bfloat16) {
TestSpMM_MultiThread<csrblocksparse::bfloat16, float, float>(
/*block_width=*/1,
/*block_height=*/1,
/*fatness=*/1);
}
TEST(CsrBlockSparseMatrix, SpMM_1x1MultiThreading_Bfloat16) {
TestSpMM_MultiThread<csrblocksparse::bfloat16, float, float>(
/*block_width=*/1,
/*block_height=*/1,
/*fatness=*/7);
}
TEST(CsrBlockSparseMatrix, SpMV_Block4x4_float) {
TestSpMM<float, float, float>(/*block_width=*/4,
/*block_height=*/4,
/*fatness=*/1,
/*test_matmul=*/true);
}
TEST(CsrBlockSparseMatrix, SpMM_Block4x4_float) {
TestSpMM<float, float, float>(/*block_width=*/4,
/*block_height=*/4,
/*fatness=*/7);
}
// This actually uses multiple threads, and uses the output as the input for
// multiple steps to test that synchronization and memory visibility are
// working correctly. Requires square matrices.
TEST(CsrBlockSparseMatrix, SpMV_4x4MultiThreading_float) {
TestSpMM_MultiThread<float, float, float>(/*block_width=*/4,
/*block_height=*/4,
/*fatness=*/1);
}
TEST(CsrBlockSparseMatrix, SpMM_4x4MultiThreading_float) {
TestSpMM_MultiThread<float, float, float>(/*block_width=*/4,
/*block_height=*/4,
/*fatness=*/7);
}
TEST(CsrBlockSparseMatrix, SpMV_Block1x1_float) {
TestSpMM<float, float, float>(/*block_width=*/1,
/*block_height=*/1,
/*fatness=*/1);
}
TEST(CsrBlockSparseMatrix, SpMM_Block1x1_float) {
TestSpMM<float, float, float>(/*block_width=*/1,
/*block_height=*/1,
/*fatness=*/7);
}
// This actually uses multiple threads, and uses the output as the input for
// multiple steps to test that synchronization and memory visibility are
// working correctly. Requires square matrices.
TEST(CsrBlockSparseMatrix, SpMV_1x1MultiThreading_float) {
TestSpMM_MultiThread<float, float, float>(/*block_width=*/1,
/*block_height=*/1,
/*fatness=*/1);
}
TEST(CsrBlockSparseMatrix, SpMM_1x1MultiThreading_float) {
TestSpMM_MultiThread<float, float, float>(/*block_width=*/1,
/*block_height=*/1,
/*fatness=*/7);
}
TEST(CsrBlockSparseMatrix, SpMV_Block4x4_fixed16x16_32) {
TestSpMM<csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>,
typename csrblocksparse::TypeOfProduct<
csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>>::type>(
/*block_width=*/4,
/*block_height=*/4,
/*fatness=*/1,
/*test_matmul=*/true);
}
TEST(CsrBlockSparseMatrix, SpMM_Block4x4_fixed16x16_32) {
TestSpMM<csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>,
typename csrblocksparse::TypeOfProduct<
csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>>::type>(
/*block_width=*/4,
/*block_height=*/4,
/*fatness=*/7);
}
TEST(CsrBlockSparseMatrix, SpMV_Block1x1_fixed16x16_32) {
TestSpMM<csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>,
typename csrblocksparse::TypeOfProduct<
csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>>::type>(
/*block_width=*/1,
/*block_height=*/1,
/*fatness=*/1);
}
TEST(CsrBlockSparseMatrix, SpMM_Block1x1_fixed16x16_32) {
TestSpMM<csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>,
typename csrblocksparse::TypeOfProduct<
csrblocksparse::fixed16<4>, csrblocksparse::fixed16<4>>::type>(
/*block_width=*/1,
/*block_height=*/1,
/*fatness=*/7);
}
TEST(CsrBlockSparseMatrix, SpMV_Block4x4_fixed16x16_16) {
TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
csrblocksparse::fixed16<8>>(
/*block_width=*/4,
/*block_height=*/4,
/*fatness=*/1,
/*test_matmul=*/true);
}
TEST(CsrBlockSparseMatrix, SpMM_Block4x4_fixed16x16_16) {
TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
csrblocksparse::fixed16<8>>(
/*block_width=*/4,
/*block_height=*/4,
/*fatness=*/7);
}
TEST(CsrBlockSparseMatrix, SpMV_Block1x1_fixed16x16_16) {
TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
csrblocksparse::fixed16<8>>(
/*block_width=*/1,
/*block_height=*/1,
/*fatness=*/1);
}
TEST(CsrBlockSparseMatrix, SpMM_Block1x1_fixed16x16_16) {
TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
csrblocksparse::fixed16<8>>(
/*block_width=*/1,
/*block_height=*/1,
/*fatness=*/7);
}
TEST(CsrBlockSparseMatrix, SpMV_Block4x4_fixed16x16_32_unmatched) {
TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
csrblocksparse::fixed32<13>>(
/*block_width=*/4,
/*block_height=*/4,
/*fatness=*/1,
/*test_matmul=*/true);
}
TEST(CsrBlockSparseMatrix, SpMM_Block4x4_fixed16x16_32_unmatched) {
TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
csrblocksparse::fixed32<13>>(
/*block_width=*/4,
/*block_height=*/4,
/*fatness=*/7);
}
TEST(CsrBlockSparseMatrix, SpMV_Block1x1_fixed16x16_32_unmatched) {
TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
csrblocksparse::fixed32<13>>(
/*block_width=*/1,
/*block_height=*/1,
/*fatness=*/1);
}
TEST(CsrBlockSparseMatrix, SpMM_Block1x1_fixed16x16_32_unmatched) {
TestSpMM<csrblocksparse::fixed16<5>, csrblocksparse::fixed16<5>,
csrblocksparse::fixed32<13>>(
/*block_width=*/1,
/*block_height=*/1,
/*fatness=*/7);
}
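// Converting rhs_indices to column deltas and back should reproduce the
// original indices (up to any extra padding added on re-creation).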
TEST(CsrBlockSparseMatrix, RhsIndicesDeltasRoundTrip) {
MaskedSparseMatrix<float> matrix(/*rows=*/256, /*cols=*/256,
/*sparsity=*/0.9, /*block_height=*/4,
/*block_width=*/4);
CsrBlockSparseMatrix<float, float> sparse_matrix(matrix);
CacheAlignedVector<int16_t> copy_indices = sparse_matrix.rhs_indices();
sparse_matrix.ComputeColDeltas();
sparse_matrix.ComputeRHSIndices();
  // The indices get padded when created, so the regenerated vector may be
  // bigger than the original.
EXPECT_LE(copy_indices.size(), sparse_matrix.rhs_indices().size());
for (int i = 0; i < copy_indices.size(); ++i) {
EXPECT_EQ(copy_indices[i], sparse_matrix.rhs_indices()[i]) << "i=" << i;
}
}
// Tests that a Layer that is split into 2 by columns (inputs) computes the same
// result as the original layer.
TEST(CsrBlockSparseMatrix, SplitByCol) {
int kRows = 1024;
int kCols = 1024;
MaskedSparseMatrix<float> matrix(kRows, kCols, 0.95, /*block_height=*/4,
/*block_width=*/4);
FatCacheAlignedVector<float> rhs(kCols, /*cols=*/1);
CacheAlignedVector<float> bias(kRows);
FatCacheAlignedVector<float> out1(kRows, /*cols=*/1);
FatCacheAlignedVector<float> out2(kRows, /*cols=*/1);
bias.FillRandom();
rhs.FillRandom();
out1.FillZero();
out2.FillZero();
FatCacheAlignedVector<float> out_reference = out1;
CsrBlockSparseMatrix<float, float> sparse_matrix(matrix);
SparseLinearLayer<float, float> sparse_linear_layer(std::move(sparse_matrix),
std::move(bias));
sparse_linear_layer.PrepareForThreads(1);
sparse_linear_layer.SpMM_bias(rhs, &out_reference, /*relu=*/false,
/*tid=*/0);
// Split the layer into 2 parts.
SparseLinearLayer<float, float> part1, part2;
sparse_linear_layer.SplitInputs(&part1, &part2);
part1.PrepareForThreads(1);
part2.PrepareForThreads(1);
EXPECT_EQ(kRows, part1.rows());
EXPECT_EQ(kCols / 2, part1.cols());
EXPECT_EQ(kRows, part2.rows());
EXPECT_EQ(kCols / 2, part2.cols());
MutableVectorView<float> rhs1(&rhs, 0, kCols / 2);
MutableVectorView<float> rhs2(&rhs, kCols / 2, kCols / 2);
for (int i = 0; i < kCols / 2; ++i) {
EXPECT_FLOAT_EQ(rhs[i], rhs1.data()[i]);
EXPECT_FLOAT_EQ(rhs[i + kCols / 2], rhs2.data()[i]);
}
part1.SpMM_bias(rhs1, &out1, /*relu=*/false, /*tid=*/0);
part2.SpMM_bias(rhs2, &out2, /*relu=*/false, /*tid=*/0);
// Check that out1 + out2 = out_reference.
for (int i = 0; i < kRows; ++i) {
EXPECT_NEAR(out_reference[i], out1[i] + out2[i], 2e-5)
<< " i=" << i << " out1=" << out1[i] << " out2=" << out2[i];
}
}
// Tests that a Layer that is split into 2 by rows (outputs) computes the same
// result as the original layer.
TEST(CsrBlockSparseMatrix, SplitByRow) {
int kRows = 1024;
int kCols = 1024;
MaskedSparseMatrix<float> matrix(kRows, kCols, 0.95, /*block_height=*/4,
/*block_width=*/4);
FatCacheAlignedVector<float> rhs(kCols, /*cols=*/1);
CacheAlignedVector<float> bias(kRows);
FatCacheAlignedVector<float> out1(kRows, /*cols=*/1);
FatCacheAlignedVector<float> out2(kRows, /*cols=*/1);
bias.FillRandom();
rhs.FillRandom();
out1.FillZero();
out2.FillZero();
FatCacheAlignedVector<float> out_reference = out1;
CsrBlockSparseMatrix<float, float> sparse_matrix(matrix);
SparseLinearLayer<float, float> sparse_linear_layer(std::move(sparse_matrix),
std::move(bias));
sparse_linear_layer.PrepareForThreads(1);
sparse_linear_layer.SpMM_bias(rhs, &out_reference, /*relu=*/false,
/*tid=*/0);
// Split the layer into 2 parts.
SparseLinearLayer<float, float> part1, part2;
sparse_linear_layer.SplitOutputs(&part1, &part2);
part1.PrepareForThreads(1);
part2.PrepareForThreads(1);
EXPECT_EQ(kRows / 2, part1.rows());
EXPECT_EQ(kCols, part1.cols());
EXPECT_EQ(kRows / 2, part2.rows());
EXPECT_EQ(kCols, part2.cols());
MutableVectorView<float> out2a(&out2, 0, kRows / 2);
MutableVectorView<float> out2b(&out2, kRows / 2, kRows / 2);
part1.SpMM_bias(rhs, &out2a, /*relu=*/false, /*tid=*/0);
part2.SpMM_bias(rhs, &out2b, /*relu=*/false, /*tid=*/0);
// Check that out2 = out_reference.
for (int i = 0; i < kRows; ++i) {
    EXPECT_NEAR(out_reference[i], out2[i], 2e-5)
        << " i=" << i << " ref=" << out_reference[i] << " out2=" << out2[i];
}
}
TEST(CsrBlockSparseMatrix, MutableVectorView) {
const int kRows = 1024;
const int kCols = 1024;
const int kFatness = 2;
std::vector<float> values(kRows * kCols, 1.f);
std::vector<int> mask(kRows * kCols);
for (int i = 0; i < mask.size(); ++i) mask[i] = i % 2;
auto masked_matrix =
MaskedSparseMatrix<float>(kRows, kCols, mask.data(), values.data());
auto sparse_matrix = CsrBlockSparseMatrix<bfloat16, float>(masked_matrix);
FatCacheAlignedVector<float> x(kCols, kFatness);
x.FillOnes();
CacheAlignedVector<float> bias(kRows);
bias.FillZero();
// First check that we can use spans as output. Split a multiplication
// into upper and lower halves times the full vector:
// --------------- x t
// | | x t
// | | x t
// --------------- =
// | | x b
// | | x b
// --------------- x b
FatCacheAlignedVector<float> out(kRows, kFatness);
FatCacheAlignedVector<float> out_view(kRows, kFatness);
MutableVectorView<float> out_view_top(&out_view, 0, kRows / 2);
MutableVectorView<float> out_view_bottom(&out_view, kRows / 2, kRows / 2);
sparse_matrix.SpMM_bias(x, bias, &out);
auto masked_matrix_top =
MaskedSparseMatrix<float>(kRows / 2, kCols, mask.data(), values.data());
auto masked_matrix_bottom = MaskedSparseMatrix<float>(
kRows / 2, kCols, mask.data() + kRows * kCols / 2,
values.data() + kRows * kCols / 2);
auto sparse_matrix_top =
CsrBlockSparseMatrix<bfloat16, float>(masked_matrix_top);
auto sparse_matrix_bottom =
CsrBlockSparseMatrix<bfloat16, float>(masked_matrix_bottom);
sparse_matrix_top.SpMM_bias(x, bias, &out_view_top);
sparse_matrix_bottom.SpMM_bias(x, bias, &out_view_bottom);
CheckResult(out, out_view, kCols);
// Check that we can use a span as an input vector. Multiply upper left
// portion of the matrix by the top half of the vector.
// ---------------
// |oooooo | x q
// |oooooo | x q
// | | =
// | |
// ---------------
auto masked_matrix_quarter = MaskedSparseMatrix<float>(
kRows / 2, kCols / 2, mask.data(), values.data());
auto sparse_matrix_quarter =
CsrBlockSparseMatrix<bfloat16, float>(masked_matrix_quarter);
MutableVectorView<float> x_top(&x, 0, kCols / 2);
FatCacheAlignedVector<float> out_correct(kRows / 2, /*cols=*/2);
for (int i = 0; i < kFatness * (kRows / 2); ++i) out_correct[i] = 256.f;
MutableVectorView<float> bias_top(&bias, 0, kRows / 2);
FatCacheAlignedVector<float> out_quarter(kRows / 2, kFatness);
sparse_matrix_quarter.SpMM_bias(x_top, bias_top, &out_quarter);
CheckResult(out_correct, out_quarter, kCols / 2);
}
namespace {
bool skip_test(const absl::Status& status, absl::string_view msg) {
if (!status.ok()) {
    LOG(INFO) << "Couldn't load " << msg << ", skipping test: " << status;
return true;
}
return false;
}
} // namespace
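// Loads each layer from testdata (or from /data/local/tmp on arm devices),
// runs the masked and block-sparse implementations on the same random input,
// and checks that they agree; the test is skipped if the files are missing.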
TEST(CsrBlockSparseMatrix, ModelMatrices_Bfloat16) {
std::vector<std::string> names = {
"768_512_95_4x4_wavernn_gru_", "768_512_95_4x4_coarseproj_",
"768_512_95_4x4_coarselogit_", "768_512_95_4x4_fineproj_",
"768_512_95_4x4_finelogit_", "lyra_conv1d_"};
const std::string kPath =
#if defined __arm__ || defined __aarch64__
"/data/local/tmp/";
#else
(ghc::filesystem::current_path() / kTestdataPath).string();
#endif
for (auto& layer_name : names) {
SparseLinearLayer<bfloat16, float> sparse_linear_layer;
auto status = LoadSparseLayer<bfloat16, float>(layer_name, /*zipped=*/true,
&sparse_linear_layer, kPath);
// If the files don't exist on the device we're running on, just skip this
// test and log that it was skipped.
if (skip_test(status, layer_name)) return;
int rows = sparse_linear_layer.rows();
int cols = sparse_linear_layer.cols();
MaskedLinearLayer<float> masked_linear_layer;
status = LoadMaskedLayer<float>(layer_name, /*zipped=*/true,
&masked_linear_layer, kPath);
if (skip_test(status, layer_name)) return;
masked_linear_layer.CastWeights<csrblocksparse::bfloat16>();
CacheAlignedVector<float> rhs(cols);
CacheAlignedVector<float> out_ref(rows);
CacheAlignedVector<float> out_spmv(rows);
rhs.FillRandom();
out_ref.FillZero();
out_spmv.FillZero();
std::array<bool, 2> use_relus = {false, true};
for (bool use_relu : use_relus) {
masked_linear_layer.SpMM_bias(rhs, &out_ref, use_relu);
sparse_linear_layer.SpMM_bias(rhs, &out_spmv, use_relu);
CheckResult(out_ref, out_spmv, cols);
}
}
}
TEST(CsrBlockSparseMatrix, ModelMatrices_float) {
std::vector<std::string> names = {
"768_512_95_4x4_wavernn_gru_", "768_512_95_4x4_coarseproj_",
"768_512_95_4x4_coarselogit_", "768_512_95_4x4_fineproj_",
"768_512_95_4x4_finelogit_", "lyra_conv1d_"};
const std::string kPath =
#if defined __arm__ || defined __aarch64__
"/data/local/tmp/";
#else
(ghc::filesystem::current_path() / kTestdataPath).string();
#endif
for (auto& layer_name : names) {
SparseLinearLayer<float, float> sparse_linear_layer;
auto status = LoadSparseLayer<float, float>(layer_name, /*zipped=*/true,
&sparse_linear_layer, kPath);
// If the files don't exist on the device we're running on, just skip this
// test and log that it was skipped.
if (skip_test(status, layer_name)) return;
int rows = sparse_linear_layer.rows();
int cols = sparse_linear_layer.cols();
MaskedLinearLayer<float> masked_linear_layer;
status = LoadMaskedLayer<float>(layer_name, /*zipped=*/true,
&masked_linear_layer, kPath);
if (skip_test(status, layer_name)) return;
CacheAlignedVector<float> rhs(cols);
CacheAlignedVector<float> out_ref(rows);
CacheAlignedVector<float> out_spmv(rows);
rhs.FillRandom();
out_ref.FillZero();
out_spmv.FillZero();
std::array<bool, 2> use_relus = {false, true};
for (bool use_relu : use_relus) {
masked_linear_layer.SpMM_bias(rhs, &out_ref, use_relu);
sparse_linear_layer.SpMM_bias(rhs, &out_spmv, use_relu);
CheckResult(out_ref, out_spmv, cols);
}
}
}
} // namespace csrblocksparse