/*
 * Copyright 2021 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef LYRA_CODEC_SPARSE_MATMUL_VECTOR_CACHE_ALIGNED_VECTOR_H_
#define LYRA_CODEC_SPARSE_MATMUL_VECTOR_CACHE_ALIGNED_VECTOR_H_

#if defined __aarch64__
#include <arm_neon.h>
#endif
#if defined __AVX__ || defined __AVX2__
#include <immintrin.h>
#endif

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <memory>
#include <random>
#include <vector>

#include "absl/strings/str_format.h"
#include "sparse_matmul/numerics/fast_transcendentals.h"
#include "sparse_matmul/numerics/fixed_types.h"
#include "sparse_matmul/numerics/type_utils.h"
#include "sparse_matmul/os/coop_threads.h"
#include "sparse_matmul/vector/aligned_malloc.h"

namespace csrblocksparse {

template <typename T>
class MutableVectorView;
template <typename T>
class VectorView;

// CacheAlignedVector is a simple vector-like class that makes sure its
// underlying buffer is aligned to a |kCacheLineSize| boundary.  It is meant
// for numeric computation and cannot be used to store objects that are
// not POD as it will neither call their constructors nor destructors.
//
// It is meant to be used with the CSRBlockSparseMatrix class for
// implementing basic neural network layers composed of SpMV.
//
// This class is thread compatible.
template <typename DataType>
class CacheAlignedVector {
  static_assert(std::is_pod<DataType>::value,
                "CacheAlignedVector can only be"
                " used with POD");

 public:
  using value_type = DataType;

  explicit CacheAlignedVector(std::size_t size) : size_(size), data_(nullptr) {
    gen_ = absl::make_unique<std::minstd_rand>(0);
    data_ = reinterpret_cast<DataType*>(
        aligned_malloc(size_ * sizeof(DataType), kCacheLineSize));
  }

  explicit CacheAlignedVector(const std::vector<DataType>& input)
      : size_(input.size()), data_(nullptr) {
    gen_ = absl::make_unique<std::minstd_rand>(0);
    data_ = reinterpret_cast<DataType*>(
        aligned_malloc(size_ * sizeof(DataType), kCacheLineSize));
    memcpy(data_, input.data(), size_ * sizeof(DataType));
  }

  template <typename InputType>
  explicit CacheAlignedVector(const std::vector<InputType>& input)
      : size_(input.size()), data_(nullptr) {
    gen_ = absl::make_unique<std::minstd_rand>(0);
    data_ = reinterpret_cast<DataType*>(
        aligned_malloc(size_ * sizeof(DataType), kCacheLineSize));
    for (int i = 0; i < size_; ++i)
      data_[i] = static_cast<DataType>(input.data()[i]);
  }

  CacheAlignedVector(const DataType* input, int size)
      : size_(size), data_(nullptr) {
    gen_ = absl::make_unique<std::minstd_rand>(0);
    data_ = reinterpret_cast<DataType*>(
        aligned_malloc(size_ * sizeof(DataType), kCacheLineSize));
    memcpy(data_, input, size_ * sizeof(DataType));
  }

  template <typename InputType>
  explicit CacheAlignedVector(const InputType* input, int size)
      : size_(size), data_(nullptr) {
    gen_ = absl::make_unique<std::minstd_rand>(0);
    data_ = reinterpret_cast<DataType*>(
        aligned_malloc(size_ * sizeof(DataType), kCacheLineSize));
    for (int i = 0; i < size_; ++i) data_[i] = static_cast<DataType>(input[i]);
  }

  CacheAlignedVector() : size_(0), data_(nullptr) {}

  ~CacheAlignedVector() {
    aligned_free(data_);
    data_ = nullptr;
    size_ = 0;
  }

  // Copies are _deep_ copies.
  CacheAlignedVector(CacheAlignedVector const& other)
      : size_(0), data_(nullptr), gen_(nullptr) {
    if (other.gen_) {
      gen_ = absl::make_unique<std::minstd_rand>(std::minstd_rand(*other.gen_));
    }
    this->resize(other.size());
    memcpy(data_, other.data(), size_ * sizeof(DataType));
  }
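  // A minimal usage sketch (illustrative only, not part of the API): the
  // constructors above allocate cache-aligned storage, after which the vector
  // can be filled and sampled like this:
  //
  //   csrblocksparse::CacheAlignedVector<float> logits(256);
  //   logits.FillRandom();                       // uniform in [-10, 10]
  //   int winner = logits.Sample(/*temperature=*/1.f);
  //
  // |Sample| is the in-place float overload declared further down.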
  // Copies a slice of the input.
  CacheAlignedVector(CacheAlignedVector const& other, int start, int end)
      : size_(0), data_(nullptr), gen_(nullptr) {
    if (other.gen_) {
      gen_ = absl::make_unique<std::minstd_rand>(std::minstd_rand(*other.gen_));
    }
    this->resize(end - start);
    memcpy(data_, other.data() + start, size_ * sizeof(DataType));
  }

  void operator=(CacheAlignedVector const& other) {
    if (other.gen_)
      gen_ = absl::make_unique<std::minstd_rand>(std::minstd_rand(*other.gen_));
    else
      gen_.reset(nullptr);
    this->resize(other.size());
    memcpy(data_, other.data(), size_ * sizeof(DataType));
  }

  CacheAlignedVector(CacheAlignedVector<DataType>&& other)
      : size_(0), data_(nullptr), gen_(std::move(other.gen_)) {
    size_ = other.size_;
    data_ = other.data_;
    other.size_ = 0;
    other.data_ = nullptr;
  }

  CacheAlignedVector<DataType>& operator=(
      CacheAlignedVector<DataType>&& other) {
    aligned_free(data_);
    if (other.gen_)
      gen_ = absl::make_unique<std::minstd_rand>(std::move(*other.gen_));
    else
      gen_.reset(nullptr);
    size_ = other.size_;
    data_ = other.data_;
    other.size_ = 0;
    other.data_ = nullptr;
    return *this;
  }

  VectorView<DataType> AsView() const {
    return VectorView<DataType>(this->data(), this->size(), 1);
  }

  MutableVectorView<DataType> AsMutableView() {
    return MutableVectorView<DataType>(this->data(), this->size(), 1);
  }

  // Copies the |split_points| to use in ReducingSample.
  void PrepareForThreads(const std::vector<int>& split_points,
                         int block_height) {
    maxes_.resize(split_points.size() - 1);
    thread_starts_ = split_points;
    for (int t = 0; t < thread_starts_.size(); ++t) {
      thread_starts_[t] *= block_height;
    }
  }

  void FillRandom(float min = -10.f, float max = 10.f) {
    // 10 is smaller than any nonzero bound of the range of any data type.
    std::uniform_real_distribution<float> dist(min, max);
    for (std::size_t i = 0; i < size_; i++) {
      data_[i] = DataType(dist(*gen_));
    }
  }

  void FillZero() {
    for (std::size_t i = 0; i < size_; i++) {
      data_[i] = DataType(0.f);
    }
  }

  void FillOnes() {
    for (std::size_t i = 0; i < size_; i++) {
      data_[i] = DataType(1.f);
    }
  }

  void FillWith(const DataType& value) {
    for (std::size_t i = 0; i < size_; i++) {
      data_[i] = value;
    }
  }

  // Interprets |data_| as logits and samples from the distribution. This
  // version operates IN PLACE and uses an internal random source.
  template <class Q = DataType>
  typename std::enable_if<std::is_same<Q, float>::value, int>::type Sample(
      float temperature = 1.f) {
    return Sample(temperature, gen_.get(), this);
  }

  // Interprets |data_| as logits and samples. This version requires the random
  // source and temporary memory to be passed in. It is thread safe assuming
  // no other threads are using the generator and temporary memory.
#if defined __aarch64__
  template <class Q = DataType>
  typename std::enable_if<std::is_same<Q, float>::value, int>::type Sample(
      float temperature, std::minstd_rand* gen,
      CacheAlignedVector<float>* scratch) const {
    DCHECK(scratch->size() >= size_);
    // Round down to nearest multiple of 8.
    int SIMD_iterations = 8 * (size_ / 8);
    float* scratch_ptr = scratch->data();
    std::uniform_real_distribution<float> dist;
    float random_number = dist(*gen);

    float32x4_t sum = vdupq_n_f32(0.f);
    float32x4_t sum1 = vdupq_n_f32(0.f);
    float32x4_t max_value = vdupq_n_f32(std::numeric_limits<float>::lowest());
    float32x4_t max_value1 = vdupq_n_f32(std::numeric_limits<float>::lowest());
    float32x4_t inv_temp = vdupq_n_f32(1.f / temperature);
    // Compute sum of exp(x) for the denominator.
    // Hand unroll by 2, gives speed improvement.
    constexpr int kUnrollFactor = 2;
    constexpr int kElementsPerIter = kUnrollFactor * kSIMDWidth;
    for (std::size_t i = 0; i < SIMD_iterations; i += kElementsPerIter) {
      max_value = vmaxq_f32(vld1q_f32(data_ + i), max_value);
      max_value1 = vmaxq_f32(vld1q_f32(data_ + i + 4), max_value1);
    }
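    // The max computed here feeds the usual numerically stable softmax below:
    // exp((x_i - max) / T) / sum_j exp((x_j - max) / T) equals
    // exp(x_i / T) / sum_j exp(x_j / T), but keeps every exp argument <= 0 so
    // fast_exp cannot overflow.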
    // Pairwise reduction.
    max_value = vpmaxq_f32(max_value, max_value1);
    // Duplicate (dupq) maximum across vector (maxnmvq).
    float scalar_max_value = vmaxvq_f32(max_value);
    for (int i = SIMD_iterations; i < size_; ++i) {
      scalar_max_value = std::max(data_[i], scalar_max_value);
    }
    max_value = vdupq_n_f32(scalar_max_value);

    for (std::size_t i = 0; i < SIMD_iterations; i += kElementsPerIter) {
      // Load and multiply by temperature.
      float32x4_t x =
          vmulq_f32(vsubq_f32(vld1q_f32(data_ + i), max_value), inv_temp);
      float32x4_t x1 =
          vmulq_f32(vsubq_f32(vld1q_f32(data_ + i + 4), max_value), inv_temp);

      float32x4_t exponent = fast_exp(x);
      float32x4_t exponent1 = fast_exp(x1);

      sum = vaddq_f32(sum, exponent);
      sum1 = vaddq_f32(sum1, exponent1);

      vst1q_f32(scratch_ptr + i, exponent);
      vst1q_f32(scratch_ptr + i + 4, exponent1);
    }

    // Horizontally reduce the two sums.
    sum = vpaddq_f32(sum, sum1);
    sum = vpaddq_f32(sum, sum);
    float denom = vgetq_lane_f32(sum, 0) + vgetq_lane_f32(sum, 1);

    for (int i = SIMD_iterations; i < size_; ++i) {
      float x = (data_[i] - scalar_max_value) / temperature;
      float x_exp = expf(x);
      denom += x_exp;
      scratch_ptr[i] = x_exp;
    }

    // Note: rather than normalize all the probabilities, we can just
    // apply the inverse normalization to the random number.
    random_number *= denom;

    // Now do the scan in serial, return as soon as possible.
    // TODO(b/188821456): This could be made into a parallel SIMD scan
    // followed by a binary search, for a small speedup.
    float cumsum = 0.f;
    for (std::size_t i = 0; i < size_; i++) {
      cumsum += scratch_ptr[i];
      if (cumsum >= random_number) return i;
    }
    return size_ - 1;
  }

  template <typename Q>
  static inline int32x4_t vmul_temp_fixed(int32x4_t x, int32x2_t inv_temp) {
    int32x2_t xh = vget_high_s32(x);
    int32x2_t xl = vget_low_s32(x);
    int32x2_t ph = vqrshrn_n_s64(vmull_s32(xh, inv_temp), Q::kMantissaBits);
    int32x2_t pl = vqrshrn_n_s64(vmull_s32(xl, inv_temp), Q::kMantissaBits);
    return vcombine_s32(pl, ph);
  }

  template <typename Q>
  static inline int float_to_fixed(float x) {
    return static_cast<int>(x * (1 << Q::kMantissaBits));
  }

  template <typename Q>
  static inline float fixed_to_float(int x) {
    const float inv_denom = 1.f / (1 << Q::kMantissaBits);
    return static_cast<float>(x) * inv_denom;
  }

  template <class Q = DataType>
  typename std::enable_if<IsFixed32Type<Q>::value, int>::type Sample(
      float temperature, std::minstd_rand* gen,
      CacheAlignedVector<int>* scratch) const {
    DCHECK(scratch->size() >= size_);
    // Round down to nearest multiple of 8.
    int SIMD_iterations = 8 * (size_ / 8);
    int* scratch_ptr = scratch->data();
    float scalar_inv_temp = 1.f / temperature;
    int32x4_t sum = vdupq_n_s32(0);
    int32x4_t sum1 = vdupq_n_s32(0);
    int32x4_t max_value = vdupq_n_s32(std::numeric_limits<int32_t>::lowest());
    int32x4_t max_value1 = vdupq_n_s32(std::numeric_limits<int32_t>::lowest());
    int32x2_t inv_temp = vdup_n_s32(float_to_fixed<Q>(scalar_inv_temp));
    // Compute sum of exp(x) for the denominator.
    // Hand unroll by 2, gives speed improvement.
    const int* data_ptr = reinterpret_cast<const int*>(data_);
    constexpr int kUnrollFactor = 2;
    constexpr int kElementsPerIter = kUnrollFactor * kSIMDWidth;
    for (std::size_t i = 0; i < SIMD_iterations; i += kElementsPerIter) {
      max_value = vmaxq_s32(vld1q_s32(data_ptr + i), max_value);
      max_value1 = vmaxq_s32(vld1q_s32(data_ptr + i + kSIMDWidth), max_value1);
    }
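    // Same stable-softmax structure as the float path above, but all values
    // here are raw fixed-point integers scaled by 2^kMantissaBits (e.g. with
    // kMantissaBits = 24, 1.0f corresponds to 1 << 24), which is exactly the
    // convention float_to_fixed() and fixed_to_float() use.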
    // Pairwise reduction.
    max_value = vpmaxq_s32(max_value, max_value1);
    int scalar_max_value = vmaxvq_s32(max_value);
    for (int i = SIMD_iterations; i < size_; ++i) {
      scalar_max_value = std::max(data_[i].raw_val(), scalar_max_value);
    }
    max_value = vdupq_n_s32(scalar_max_value);

    // We clip all loaded values to a lower bound of the lowest possible arg to
    // exp + the max value that we are going to subtract, to prevent underflow
    // in exp and also to avoid wrap-around with values that are already minint.
    int32x4_t clip_min =
        vdupq_n_s32(scalar_max_value - (80 << MantissaBitsOf<DataType>::value));
    for (std::size_t i = 0; i < SIMD_iterations; i += kElementsPerIter) {
      // Load and multiply by temperature.
      int32x4_t loaded = vmaxq_s32(vld1q_s32(data_ptr + i), clip_min);
      int32x4_t x = vmul_temp_fixed<Q>(vsubq_s32(loaded, max_value), inv_temp);
      loaded = vmaxq_s32(vld1q_s32(data_ptr + i + kSIMDWidth), clip_min);
      int32x4_t x1 = vmul_temp_fixed<Q>(vsubq_s32(loaded, max_value), inv_temp);

      int32x4_t exponent = vcvtq_n_s32_f32(fast_exp_fixed<Q::kExponentBits>(x),
                                           Q::kMantissaBits);
      int32x4_t exponent1 = vcvtq_n_s32_f32(
          fast_exp_fixed<Q::kExponentBits>(x1), Q::kMantissaBits);

      sum = vaddq_s32(sum, exponent);
      sum1 = vaddq_s32(sum1, exponent1);

      vst1q_s32(scratch_ptr + i, exponent);
      vst1q_s32(scratch_ptr + i + kSIMDWidth, exponent1);
    }

    // Horizontally reduce the two sums.
    sum = vpaddq_s32(sum, sum1);
    sum = vpaddq_s32(sum, sum);
    float denom =
        fixed_to_float<Q>(vgetq_lane_s32(sum, 0) + vgetq_lane_s32(sum, 1));

    for (int i = SIMD_iterations; i < size_; ++i) {
      float x_exp = fast_exp_fixed(
          DataType((data_[i].raw_val() - scalar_max_value) * scalar_inv_temp));
      denom += x_exp;
      scratch_ptr[i] = float_to_fixed<Q>(x_exp);
    }

    // Note: rather than normalize all the probabilities, we can just
    // apply the inverse normalization to the random number.
    std::uniform_real_distribution<float> dist;
    int random_number = float_to_fixed<Q>(dist(*gen) * denom);

    // Now do the scan in serial, return as soon as possible.
    // TODO(b/188821456): This could be made into a parallel SIMD scan
    // followed by a binary search, for a small speedup.
    int cumsum = 0;
    for (std::size_t i = 0; i < size_; i += kSIMDWidth) {
      int32x4_t next_vals = vld1q_s32(&scratch_ptr[i]);
      cumsum += vaddvq_s32(next_vals);
      if (cumsum >= random_number) {
        int high_sum = vaddv_s32(vget_high_s32(next_vals));
        if (cumsum - high_sum > random_number) {
          // One of the lower ones.
          return (cumsum - high_sum - scratch_ptr[i + 1] > random_number)
                     ? i
                     : i + 1;
        } else {
          // One of the upper ones.
          return (cumsum - scratch_ptr[i + 3] > random_number) ? i + 2 : i + 3;
        }
      }
    }
    return size_ - 1;
  }
#endif  // defined __aarch64__

  template <class Q = DataType>
#if defined __aarch64__
  typename std::enable_if<
      !std::is_same<Q, float>::value && !IsFixed32Type<Q>::value, int>::type
#else
  int
#endif
  Sample(float temperature, std::minstd_rand* gen,
         CacheAlignedVector<float>* scratch, int tid = 0,
         SpinBarrier* barrier = nullptr) const {
    return ScalarSample(temperature, gen, scratch, tid, 0, -1, barrier);
  }

  int ScalarSample(float temperature, std::minstd_rand* gen,
                   CacheAlignedVector<float>* scratch, int tid = 0,
                   const int mindex = 0, const int maxdex = -1,
                   SpinBarrier* barrier = nullptr) const {
    // TODO(b/188821456) Don't ignore |tid| and |barrier|. Currently all threads
    // duplicate the same work and ignore |tid| and |barrier|, but they could
    // be used to execute a reducing max over the data before the exp operation.
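    // The sampling below makes three passes over [mindex, maxindex):
    // (1) find the max logit, (2) write exp((x - max) / temperature) into
    // |scratch| while accumulating the sum, and (3) walk the cumulative sum
    // until it crosses a uniform random threshold scaled by that sum.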
    DCHECK_EQ(barrier, nullptr);
    DCHECK_EQ(tid, 0);
    DCHECK(scratch->size() >= size_);
    DCHECK(size_ % 8 == 0) << "CacheAlignedVector size must be a multiple of "
                              "8 to allow for maximum SIMD and loop unroll, "
                              "got "
                           << size_ % 8;
    DCHECK(mindex >= 0 && mindex < static_cast<int>(size_));
    DCHECK(maxdex == -1 ||
           (mindex < maxdex && maxdex < static_cast<int>(size_)));
    int maxindex = maxdex > 0 ? maxdex : size_;
    float* scratch_ptr = scratch->data();
    std::uniform_real_distribution<float> dist;
    float random_number = dist(*gen);

    float sum = 0.f;
    float max_value = std::numeric_limits<float>::lowest();
    for (int i = mindex; i < maxindex; ++i) {
      max_value = std::max(max_value, static_cast<float>(data_[i]));
    }
    float inv_temperature = 1.f / temperature;
    for (int i = mindex; i < maxindex; ++i) {
      float exponent = fast_exp((static_cast<float>(data_[i]) - max_value) *
                                inv_temperature);
      scratch_ptr[i] = exponent;
      sum += exponent;
    }

    // Note: rather than normalize all the probabilities, we can just
    // apply the inverse normalization to the random number.
    random_number *= sum;

    float cumsum = 0.f;
    for (std::size_t i = mindex; i < maxindex; i++) {
      cumsum += scratch_ptr[i];
      if (cumsum >= random_number) return i;
    }
    return maxindex - 1;
  }

#if defined __AVX2__
  // Some AVX2-only code.
  // Returns the max of |data_| in the range [|t_start|, |t_end|).
  inline int ThreadMax(int t_start, int t_end) const {
    // Note: The AVX2 code requires that the number of threads and the output
    // size be a power of 2. For efficiency purposes, these should be checked
    // when preparing for threads in an architecture class.
    // The output size must be a power of 2 so the binary search for the sample
    // point works correctly.
    // The number of threads must be a power of 2 so that it nicely divides the
    // output size, which has to be a power of 2.
    __m256i maxes =
        _mm256_load_si256(reinterpret_cast<__m256i const*>(data_ + t_start));
    for (int i = t_start + kSIMDWidth; i < t_end; i += kSIMDWidth) {
      __m256i data =
          _mm256_load_si256(reinterpret_cast<__m256i const*>(data_ + i));
      maxes = _mm256_max_epi32(maxes, data);
    }
    // Max within the register.
    // Bring the top lane down to the bottom.
    __m256i other = _mm256_permute4x64_epi64(maxes, 0xe);
    maxes = _mm256_max_epi32(maxes, other);
    // Bring the 2nd 64 bits to the bottom.
    other = _mm256_shuffle_epi32(maxes, 0xe);
    maxes = _mm256_max_epi32(maxes, other);
    // Bring the 2nd 32 bits to the bottom.
    other = _mm256_shuffle_epi32(maxes, 1);
    maxes = _mm256_max_epi32(maxes, other);
    return _mm256_extract_epi32(maxes, 0);
  }

  // Applies exp (approximately) to the difference between |data_| and
  // |max_value|, storing the result in scratch, and returns the sum.
  template <int kMantissaBits>
  inline float ApplyExpAndSum(int max_value, float* scratch_ptr) {
    // Rough approximation for exp(x). See fast_exp_fixed.
    // Constant clipping limit on exp arg. Since its value is never positive,
    // we only need to clip on the negative side.
    constexpr int kClipLimit = -(80 << kMantissaBits);
    __m256i clip_val = _mm256_set1_epi32(kClipLimit);
    // Multiplication factor to convert x from log base e to log base 2,
    // shifted by an amount that lines up the binary point with the float32
    // representation, after the multiplication.
    static const int kLogFactor = (1 << (23 - kMantissaBits)) / logf(2.f);
    __m256i log_factor = _mm256_set1_epi32(kLogFactor);
    // Fix the exponent bias and add the additive fudge factor for the mantissa
    // to finish the approximate conversion.
    constexpr int kAddConstant = (127 << 23) - 366000;
    __m256i constant = _mm256_set1_epi32(kAddConstant);
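    // Scalar sketch of the bit trick used in the loop below (illustrative
    // only): for a non-positive fixed-point argument x,
    //   int32_t bits = x * kLogFactor + kAddConstant;
    //   float e = absl::bit_cast<float>(bits);  // ~expf(x / 2^kMantissaBits)
    // Multiplying by kLogFactor rescales x by log2(e) into the float32
    // exponent field, and kAddConstant restores the exponent bias (minus a
    // small fudge term for the mantissa).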
    // Broadcast the max_value.
    __m256i max_val = _mm256_set1_epi32(max_value);
    // Add the max to the |clip_val|, so it can be used before the subtraction.
    clip_val = _mm256_add_epi32(clip_val, max_val);
    // The sum of the exps.
    __m256 sum1 = _mm256_setzero_ps();
    for (int i = 0; i < size_; i += kSIMDWidth) {
      // |data_| - |max_value|.
      __m256i data =
          _mm256_load_si256(reinterpret_cast<__m256i const*>(data_ + i));
      // Clip to negative limit before the subtraction of |max_val| to avoid
      // wrap-around with min-int values.
      data = _mm256_max_epi32(data, clip_val);
      __m256i difference = _mm256_sub_epi32(data, max_val);
      // Exponent trick exp.
      // Multiply by |log_factor|, keeping only the lower 32 bits.
      difference = _mm256_mullo_epi32(difference, log_factor);
      // Add the constant.
      difference = _mm256_add_epi32(difference, constant);
      // Reinterpret the results as float32.
      __m256 float_exp = _mm256_castsi256_ps(difference);
      // Sum the results and save to scratch space.
      _mm256_store_ps(scratch_ptr + i, float_exp);
      sum1 = _mm256_add_ps(sum1, float_exp);
    }
    // Horizontally add the 8 values in sum.
    // Get the top lane down to the bottom.
    __m256 sum2 = _mm256_permute2f128_ps(sum1, sum1, 1);
    sum1 = _mm256_add_ps(sum1, sum2);
    sum1 = _mm256_hadd_ps(sum1, sum1);
    sum1 = _mm256_hadd_ps(sum1, sum1);
    return _mm256_cvtss_f32(sum1);
  }

  // Binary search for the index where the cumulative sum meets random_target.
  inline void FindSamplePoint(const float* scratch_ptr, float* random_target,
                              int* start, int* end) {
    int halfsize = (*end - *start) / 2;
    do {
      // Sum the first half.
      // We sum the section in two independent parts, so we can step down 2
      // levels if we get a hit in this half.
      int quartersize = halfsize / (2 * kSIMDWidth);
      quartersize *= kSIMDWidth;
      halfsize = quartersize * 2;
      // The sums of the quarters.
      __m256 sum1 = _mm256_setzero_ps();
      __m256 sum2 = _mm256_setzero_ps();
      const float* ptr1 = scratch_ptr + *start;
      const float* ptr2 = ptr1 + quartersize;
      for (int i = 0; i < quartersize; i += kSIMDWidth) {
        __m256 data1 = _mm256_load_ps(ptr1 + i);
        __m256 data2 = _mm256_load_ps(ptr2 + i);
        sum1 = _mm256_add_ps(sum1, data1);
        sum2 = _mm256_add_ps(sum2, data2);
      }
      // Horizontally add the two sums, keeping the results separate.
      // Numbering |sum1|=[0-7] and |sum2|=[8-15]...
      sum1 = _mm256_hadd_ps(sum1, sum2);
      // |sum1| now has [0+1, 2+3, 8+9, 10+11, 4+5, 6+7, 12+13, 14+15].
      // Bring the top lane down to the bottom.
      sum2 = _mm256_permute2f128_ps(sum1, sum1, 1);
      sum1 = _mm256_hadd_ps(sum1, sum2);
      // Now |sum1| has [0-3, 8-11, 4-7, 12-15], so swap the middle two
      // elements.
      sum1 = _mm256_shuffle_ps(sum1, sum1, 0xd8);
      sum1 = _mm256_hadd_ps(sum1, sum1);
      // Now |sum1| has [0-7, 8-15, ....].
      float bottom_quarter = _mm256_cvtss_f32(sum1);
      if (bottom_quarter >= *random_target) {
        *end = *start + quartersize;
      } else {
        float bottom_half = _mm256_cvtss_f32(_mm256_hadd_ps(sum1, sum1));
        if (bottom_half >= *random_target) {
          *start += quartersize;
          *end = *start + quartersize;
          *random_target -= bottom_quarter;
        } else {
          *start += halfsize;
          *random_target -= bottom_half;
        }
      }
      halfsize = (*end - *start) / 2;
    } while (halfsize >= kSIMDWidth * 2);
  }
#endif  // __AVX2__ code

  // Fixed32 version.
  template <class Q = DataType>
  typename std::enable_if<IsFixed32Type<Q>::value, int>::type ThreadMax(
      int tid) const {
    int t_start = thread_starts_[tid];
    int t_end = thread_starts_[tid + 1];
#if defined __AVX2__
    return ThreadMax(t_start, t_end);
#else
    // With operator<, could use std::max_element.
    int max_value = data_[t_start].raw_val();
    for (int i = t_start + 1; i < t_end; ++i) {
      max_value = std::max(max_value, data_[i].raw_val());
    }
    return max_value;
#endif
  }

  // As Sample above, except that if |tid| and |barrier| are provided, it will
  // save some time by running a local max in each thread before combining them
  // and doing the rest of the work duplicated across all threads.
  template <class Q = DataType>
  typename std::enable_if<!IsFixed32Type<Q>::value, int>::type ReducingSample(
      std::minstd_rand* gen, CacheAlignedVector<float>* scratch, int tid = 0,
      float temperature = 1.0f, SpinBarrier* barrier = nullptr) {
    if (barrier != nullptr) barrier->barrier();
    // Sample only accepts tid of 0, as it would ignore it anyway.
    // All threads duplicate the same work in this path.
    return Sample(temperature, gen, scratch, /*tid=*/0);
  }

  // Fixed32 version.
  template <class Q = DataType>
  typename std::enable_if<IsFixed32Type<Q>::value, int>::type ReducingSample(
      std::minstd_rand* gen, CacheAlignedVector<float>* scratch, int tid = 0,
      float temperature = 1.0f, SpinBarrier* barrier = nullptr) {
    int max_value;
    if (barrier == nullptr) {
      // There is only one thread.
      max_value = ThreadMax(tid);
    } else {
      // Reduce max using the threads to do some of the work.
      maxes_[tid] = ThreadMax(tid);
      barrier->barrier();
      // The rest of the work is duplicated by all threads.
      max_value = *std::max_element(maxes_.begin(), maxes_.end());
    }
    float* scratch_ptr = scratch->data();
    std::uniform_real_distribution<float> dist;

    float sum = 0.0f;
#if defined __AVX2__
    sum = ApplyExpAndSum<MantissaBitsOf<DataType>::value>(max_value,
                                                          scratch_ptr);
#else
    int clip_limit = max_value - (80 << MantissaBitsOf<DataType>::value);
    for (int i = 0; i < size_; ++i) {
      int difference = std::max(data_[i].raw_val(), clip_limit) - max_value;
      float exponent = expf(static_cast<float>(DataType(difference)));
      scratch_ptr[i] = exponent;
      sum += exponent;
    }
#endif  // __AVX2__
    float random_target = dist(*gen) * sum;
    int start = 0;
    int end = size_;
#if defined __AVX2__
    FindSamplePoint(scratch_ptr, &random_target, &start, &end);
    // The scalar code finishes the job from here...
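    // At this point FindSamplePoint has narrowed [start, end) to a small
    // window and already subtracted the probability mass before |start| from
    // |random_target|, so the linear scan below only touches a few elements.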
#endif  // __AVX2__
    float cumsum = 0.f;
    for (std::size_t i = start; i < end; i++) {
      cumsum += scratch_ptr[i];
      if (cumsum >= random_target) return i;
    }
    return end - 1;
  }

  template <class Q = DataType>
  typename std::enable_if<std::is_same<Q, float>::value, void>::type Exp() {
#if defined __aarch64__
    DCHECK(size_ % 16 == 0) << "CacheAlignedVector size must be a multiple of "
                               "16 to allow for maximum SIMD and loop unroll "
                               "got "
                            << size_ % 16;
    constexpr int kUnrollFactor = 4;
    constexpr int kElementsPerIter = kUnrollFactor * kSIMDWidth;
    for (std::size_t i = 0; i < size_; i += kElementsPerIter) {
      float32x4_t x = vld1q_f32(data_ + i);
      float32x4_t x1 = vld1q_f32(data_ + i + 4);
      float32x4_t x2 = vld1q_f32(data_ + i + 8);
      float32x4_t x3 = vld1q_f32(data_ + i + 12);
      vst1q_f32(data_ + i, fast_exp(x));
      vst1q_f32(data_ + i + 4, fast_exp(x1));
      vst1q_f32(data_ + i + 8, fast_exp(x2));
      vst1q_f32(data_ + i + 12, fast_exp(x3));
    }
#else
    for (int i = 0; i < size_; ++i) {
      data_[i] = expf(data_[i]);
    }
#endif  // defined __aarch64__
  }

  template <class Q = DataType>
  typename std::enable_if<std::is_same<Q, float>::value, void>::type Sigmoid() {
#if defined __aarch64__
    DCHECK(size_ % 8 == 0) << "CacheAlignedVector size must be a multiple of "
                              "8 to allow for maximum SIMD and loop unroll "
                              "got "
                           << size_ % 8;
    constexpr int kUnrollFactor = 2;
    constexpr int kElementsPerIter = kUnrollFactor * kSIMDWidth;
    for (std::size_t i = 0; i < size_; i += kElementsPerIter) {
      float32x4_t x = vld1q_f32(data_ + i);
      float32x4_t x1 = vld1q_f32(data_ + i + 4);
      vst1q_f32(data_ + i, fast_sigmoid(x));
      vst1q_f32(data_ + i + 4, fast_sigmoid(x1));
    }
#else
    for (int i = 0; i < size_; ++i) {
      data_[i] = 1.f / (1.f + expf(-data_[i]));
    }
#endif  // defined __aarch64__
  }

  template <class Q = DataType, class OutType = Q>
  typename std::enable_if<
      IsFixed32Type<Q>::value && IsFixed32Type<OutType>::value, void>::type
  // For benchmarking only.
  Sigmoid(const int32_t* sigmoid_table, CacheAlignedVector<OutType>* result) {
#if defined __AVX2__
    for (int i = 0; i < size_; i += kSIMDWidth) {
      __m256i x_in = _mm256_loadu_si256(reinterpret_cast<__m256i*>(data_ + i));
      __m256i output =
          fixed32_sigmoid_fixed16<MantissaBitsOf<Q>::value,
                                  MantissaBitsOf<OutType>::value>(
              sigmoid_table, x_in);
      _mm256_store_si256(reinterpret_cast<__m256i*>(result->data() + i),
                         output);
    }
#else
    for (int i = 0; i < size_; ++i) {
      result->data()[i] = 1.f / (1.f + expf(-data_[i]));
    }
#endif  // defined __AVX2__
  }

  template <class Q = DataType>
  typename std::enable_if<std::is_same<Q, float>::value, void>::type Tanh() {
#if defined __aarch64__
    DCHECK(size_ % 8 == 0) << "CacheAlignedVector size must be a multiple of "
                              "8 to allow for maximum SIMD and loop unroll "
                              "got "
                           << size_ % 8;
    constexpr int kUnrollFactor = 2;
    constexpr int kElementsPerIter = kUnrollFactor * kSIMDWidth;
    for (std::size_t i = 0; i < size_; i += kElementsPerIter) {
      float32x4_t x = vld1q_f32(data_ + i);
      float32x4_t x1 = vld1q_f32(data_ + i + 4);
      vst1q_f32(data_ + i, fast_tanh(x));
      vst1q_f32(data_ + i + 4, fast_tanh(x1));
    }
#else
    for (int i = 0; i < size_; ++i) {
      data_[i] = tanhf(data_[i]);
    }
#endif  // defined __aarch64__
  }

  template <class Q = DataType, class OutType = Q>
  typename std::enable_if<
      IsFixed32Type<Q>::value && IsFixed32Type<OutType>::value, void>::type
  // For benchmarking only.
  Tanh(const int32_t* tanh_table, CacheAlignedVector<OutType>* result) {
#if defined __AVX2__
    for (int i = 0; i < size_; i += kSIMDWidth) {
      __m256i x_in = _mm256_loadu_si256(reinterpret_cast<__m256i*>(data_ + i));
      __m256i output =
          fixed32_tanh_fixed16<MantissaBitsOf<Q>::value,
                               MantissaBitsOf<OutType>::value>(tanh_table,
                                                               x_in);
      _mm256_store_si256(reinterpret_cast<__m256i*>(result->data() + i),
                         output);
    }
#else
    for (int i = 0; i < size_; ++i) {
      result->data()[i] = tanhf(data_[i]);
    }
#endif  // defined __AVX2__
  }
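  // Sketch of the threaded sampling path for fixed32 logits (illustrative
  // only; the split points, generator, scratch buffer and SpinBarrier are
  // assumed to be set up by the caller's threading code):
  //
  //   logits.PrepareForThreads(split_points, /*block_height=*/1);
  //   // Then, inside each of the N worker threads, with a shared
  //   // SpinBarrier barrier(N):
  //   int sample = logits.ReducingSample(&gen, &scratch, tid,
  //                                      /*temperature=*/1.f, &barrier);
  //
  // Each thread computes a local max over its slice; after the barrier the
  // remaining work is duplicated in every thread, as described above.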
  // Returns |data_| cast to the correct integer type if fixed point.
  template <class Q = DataType>
  typename std::enable_if<IsFixed32Type<Q>::value, const int32_t*>::type
  cast_data() const {
    return reinterpret_cast<const int32_t*>(data_);
  }
  template <class Q = DataType>
  typename std::enable_if<IsFixed16Type<Q>::value, const int16_t*>::type
  cast_data() const {
    return reinterpret_cast<const int16_t*>(data_);
  }
  template <class Q = DataType>
  typename std::enable_if<
      !(IsFixed32Type<Q>::value || IsFixed16Type<Q>::value), const Q*>::type
  cast_data() const {
    return data_;
  }

  const DataType* begin() const { return data_; }
  const DataType* end() const { return data_ + size_; }

  const DataType* data() const { return data_; }
  DataType* data() { return data_; }

  const DataType& operator[](int pos) const { return data_[pos]; }
  DataType& operator[](int pos) { return data_[pos]; }

  std::size_t size() const { return size_; }
  bool empty() const { return size_ == 0; }
  std::size_t bytes() const { return size_ * sizeof(DataType); }

  int rows() const { return size_; }
  int cols() const { return 1; }

  // Stride to move over by one column (which is the number of rows).
  int col_stride() const { return size_; }

  void Print() const {
    for (int i = 0; i < size(); ++i)
      absl::PrintF("[%d]=%g\n", i, static_cast<float>(data_[i]));
  }

  float maximum() const {
    float max_val = std::numeric_limits<float>::lowest();
    for (int i = 0; i < size_; ++i) {
      max_val = std::max(max_val, std::abs(static_cast<float>(data_[i])));
    }
    return max_val;
  }

 private:
  void resize(std::size_t size) {
    aligned_free(data_);
    size_ = size;
    data_ = reinterpret_cast<DataType*>(
        aligned_malloc(size_ * sizeof(DataType), kCacheLineSize));
  }

  std::size_t size_;
  DataType* data_;
  // Data used by the threaded version for sampling only.
  std::vector<int> maxes_;          // Max value of logits.
  std::vector<int> thread_starts_;  // First index for this thread.

#if defined __AVX__ || defined __AVX2__
  static constexpr int kCacheLineSize = 64;
  static constexpr int kSIMDWidth = 8;
#else
  static constexpr int kCacheLineSize = 128;
  static constexpr int kSIMDWidth = 4;
#endif  // __AVX__
  std::unique_ptr<std::minstd_rand> gen_;
};

// Used for doing Sparse Matrix * Dense Matrix multiplication. This class is
// not intended to be a general Matrix class, just for the RHS of a SpMM, hence
// the name fat vector rather than Matrix. The data layout is COLUMN MAJOR.
template <typename T>
class FatCacheAlignedVector {
 public:
  using value_type = T;

  FatCacheAlignedVector() : rows_(0), cols_(0) {}

  // Creates a new vector that is (rows, cols), doesn't init memory.
  FatCacheAlignedVector(int rows, int cols)
      : vector_(rows * cols), rows_(rows), cols_(cols) {}

  // Copies and reshapes vector from (1, size) to (|rows|, size / |rows|).
  FatCacheAlignedVector(const CacheAlignedVector<T>& vector, int rows)
      : vector_(vector), rows_(rows) {
    CHECK_EQ(vector_.size() % rows_, 0);
    cols_ = vector_.size() / rows_;
  }

  template <typename U>
  explicit FatCacheAlignedVector(const FatCacheAlignedVector<U>& vector)
      : vector_(vector.size()), rows_(vector.rows()), cols_(vector.cols()) {
    for (int i = 0; i < vector.size(); ++i) {
      vector_[i] = static_cast<T>(vector[i]);
    }
  }

  // Moves and reshapes vector from (1, size) to (|rows|, size / |rows|).
  FatCacheAlignedVector(CacheAlignedVector<T>&& vector, int rows)
      : vector_(std::move(vector)), rows_(rows) {
    CHECK_EQ(vector_.size() % rows_, 0);
    cols_ = vector_.size() / rows_;
  }

  VectorView<T> slice(const int col) const {
    return VectorView<T>(this->data() + rows() * col, rows(), 1);
  }
  MutableVectorView<T> slice(const int col) {
    return MutableVectorView<T>(this->data() + rows() * col, rows(), 1);
  }

  const T* data() const { return vector_.data(); }
  T* data() { return vector_.data(); }
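  // Minimal usage sketch (illustrative only): an 8x3 column-major right-hand
  // side whose columns are written one at a time through views.
  //
  //   csrblocksparse::FatCacheAlignedVector<float> rhs(/*rows=*/8, /*cols=*/3);
  //   rhs.FillZero();
  //   csrblocksparse::MutableVectorView<float> col1 = rhs.slice(1);
  //   col1[0] = 1.f;  // element (row 0, column 1) of |rhs|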
  // Returns |data_| cast to the correct integer type if fixed point.
  template <class Q = T>
  typename std::enable_if<IsFixed32Type<Q>::value, const int32_t*>::type
  cast_data() const {
    return vector_.cast_data();
  }
  template <class Q = T>
  typename std::enable_if<IsFixed16Type<Q>::value, const int16_t*>::type
  cast_data() const {
    return vector_.cast_data();
  }
  template <class Q = T>
  typename std::enable_if<
      !(IsFixed32Type<Q>::value || IsFixed16Type<Q>::value), const Q*>::type
  cast_data() const {
    return vector_.cast_data();
  }

  int rows() const { return rows_; }
  int cols() const { return cols_; }
  int size() const { return rows_ * cols_; }
  bool empty() const { return rows_ == 0 || cols_ == 0; }
  std::size_t bytes() const { return vector_.bytes(); }

  void reshape(int rows, int cols) {
    CHECK_EQ(rows * cols, rows_ * cols_);
    rows_ = rows;
    cols_ = cols;
  }

  float maximum() const { return vector_.maximum(); }

  // Stride to move over by one column (which is the number of rows).
  int col_stride() const { return rows_; }

  void FillOnes() { vector_.FillOnes(); }
  void FillZero() { vector_.FillZero(); }
  void FillRandom(float min = -10.f, float max = 10.f) {
    vector_.FillRandom(min, max);
  }

  const T& operator[](int pos) const { return vector_[pos]; }
  T& operator[](int pos) { return vector_[pos]; }

 private:
  CacheAlignedVector<T> vector_;
  int rows_;
  int cols_;
};

// View into a 2D Matrix. Currently only supports partitions by row. This is
// expected to be used with underlying data that is COLUMN MAJOR.
template <typename T>
class MutableVectorView {
 public:
  using value_type = T;

  // Construct from a raw pointer, |rows|, |cols| and |col_stride|.
  // |col_stride| will default to |rows| if not specified.
  explicit MutableVectorView(T* data = nullptr, int rows = 0, int cols = 0,
                             int col_stride = 0)
      : data_(data),
        rows_(rows),
        cols_(cols),
        col_stride_(col_stride > 0 ? col_stride : rows) {}

  // Construct from a CacheAlignedVector, must have one column, can optionally
  // specify an offset and row count.
  explicit MutableVectorView(CacheAlignedVector<T>* vector)
      : MutableVectorView(vector->data(), vector->rows(), 1) {}

  explicit MutableVectorView(CacheAlignedVector<T>* vector, int pos = 0,
                             int rows = 0)
      : MutableVectorView(vector->data() + pos,
                          rows == 0 ? vector->rows() - pos : rows, 1,
                          vector->rows()) {}

  // Construct from a FatCacheAlignedVector, can optionally specify an offset,
  // and row count. Views that have fewer columns than the original are not
  // supported.
  explicit MutableVectorView(FatCacheAlignedVector<T>* vector)
      : MutableVectorView(vector->data(), vector->rows(), vector->cols()) {}

  MutableVectorView(FatCacheAlignedVector<T>* vector, int pos, int rows)
      : MutableVectorView(vector->data() + pos, rows, vector->cols(),
                          vector->rows()) {}

  T* data() { return data_; }
  const T* data() const { return data_; }

  // Returns |data_| cast to the correct integer type if fixed point.
  template <class Q = T>
  typename std::enable_if<IsFixed32Type<Q>::value, const int32_t*>::type
  cast_data() const {
    return reinterpret_cast<const int32_t*>(data_);
  }
  template <class Q = T>
  typename std::enable_if<IsFixed16Type<Q>::value, const int16_t*>::type
  cast_data() const {
    return reinterpret_cast<const int16_t*>(data_);
  }
  template <class Q = T>
  typename std::enable_if<
      !(IsFixed32Type<Q>::value || IsFixed16Type<Q>::value), const Q*>::type
  cast_data() const {
    return data_;
  }

  // Number of columns in the underlying (Fat)CacheAlignedVector.
  int cols() const { return cols_; }

  // Number of rows in this view.
  int rows() const { return rows_; }

  // Returns true if there's nothing in the MutableVectorView.
  bool empty() const { return rows_ == 0 || cols_ == 0; }

  // Stride to get to the next column (usually the number of rows in the
  // underlying data structure).
  int col_stride() const { return col_stride_; }
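  // Example (illustrative only): viewing rows [16, 48) of every column of a
  // FatCacheAlignedVector, e.g. the row partition owned by one thread:
  //
  //   csrblocksparse::MutableVectorView<float> part(&fat_vector, /*pos=*/16,
  //                                                 /*rows=*/32);
  //
  // Elements are still addressed column-major via col_stride(), which stays
  // equal to the underlying vector's full row count.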
  // Returns the total number of bytes that are "owned" by this view. Uses
  // cols and not col_stride.
  std::size_t bytes() const { return rows_ * cols_ * sizeof(T); }

  void reshape(int rows, int cols) {
    CHECK_EQ(rows * cols, rows_ * cols_);
    rows_ = rows;
    cols_ = cols;
    col_stride_ = rows_;
  }

  const T& operator[](int pos) const { return data_[pos]; }
  T& operator[](int pos) { return data_[pos]; }

 protected:
  T* data_;
  int rows_;
  int cols_;
  int col_stride_;
};

// Specialization of MutableVectorView which is read-only.
template <typename T>
class VectorView : public MutableVectorView<const T> {
 public:
  using value_type = T;

  explicit VectorView(const MutableVectorView<T>& other)
      : MutableVectorView<const T>(other.data(), other.rows(), other.cols(),
                                   other.col_stride()) {}

  // Construct from a raw pointer, |rows|, |cols| and |col_stride|.
  // |col_stride| will default to |rows| if not specified.
  explicit VectorView(const T* data = nullptr, int rows = 0, int cols = 0,
                      int col_stride = 0)
      : MutableVectorView<const T>(data, rows, cols, col_stride) {}

  // Construct from a CacheAlignedVector, must have one column, can optionally
  // specify an offset and row count.
  explicit VectorView(const CacheAlignedVector<T>& vector)
      : MutableVectorView<const T>(vector.data(), vector.rows(), 1) {}

  explicit VectorView(const CacheAlignedVector<T>& vector, int pos = 0,
                      int rows = 0)
      : MutableVectorView<const T>(vector.data() + pos,
                                   rows == 0 ? vector.rows() - pos : rows, 1,
                                   vector.rows()) {}

  // Construct from a FatCacheAlignedVector, can optionally specify an offset,
  // and row count. Views that have fewer columns than the original are not
  // supported.
  explicit VectorView(const FatCacheAlignedVector<T>& vector)
      : MutableVectorView<const T>(vector.data(), vector.rows(),
                                   vector.cols()) {}

  VectorView(const FatCacheAlignedVector<T>& vector, int pos, int rows)
      : MutableVectorView<const T>(vector.data() + pos, rows, vector.cols(),
                                   vector.rows()) {}

  VectorView& operator=(const MutableVectorView<T>& other) {
    this->data_ = other.data();
    this->rows_ = other.rows();
    this->cols_ = other.cols();
    this->col_stride_ = other.col_stride();
    return *this;
  }
};

}  // namespace csrblocksparse

#endif  // LYRA_CODEC_SPARSE_MATMUL_VECTOR_CACHE_ALIGNED_VECTOR_H_