// HuBERT/fairseq/clib/cuda/ngram_repeat_block_cuda_kernel.cu
/*
Copyright (c) Microsoft Corporation.
Licensed under the MIT License.
*/
/*
Kernel implementation for blocking repeated n-grams.
*/
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>
#include <torch/extension.h>
#include <vector>
// Ban repeated ngrams of length = 'no_repeat_ngram_size'
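// Worked example (illustration only): with no_repeat_ngram_size = 3, step = 4 and
// one row of tokens equal to [7, 3, 9, 7, 3], the host launches
// threads = step - no_repeat_ngram_size + 2 = 3 per block, tokens_shm ends up
// holding [7, 3, 9, 7, 3], and the final bigram [7, 3] starts at
// check_start_pos = blockDim.x = 3. Thread 0 sees that its own bigram [7, 3]
// matches, so emitting tokens_shm[2] = 9 would repeat the trigram (7, 3, 9) and
// lprobs[row][9] is set to -INFINITY.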
__global__ void banRepeatedTokens(long* __restrict__ tokens,
float* __restrict__ lprobs,
int max_predict_len, int vocab_size,
int no_repeat_ngram_size) {
auto row = blockIdx.x;
auto col = threadIdx.x;
auto start = row * (max_predict_len) + col;
// Each thread compares the ngram starting at its own
// index with the final ngram, which starts at
// step - no_repeat_ngram_size + 2 (check_start_pos in shared memory)
auto check_start_pos = blockDim.x;
auto lprob_start = row * vocab_size;
bool is_banned = true;
extern __shared__ long tokens_shm[];
tokens_shm[col] = tokens[start];
if (col == blockDim.x - 1) {
for (int i = 1; i < no_repeat_ngram_size; i++) {
if (col + i < max_predict_len) {
tokens_shm[col + i] = tokens[start + i];
}
}
}
__syncthreads();
for (int k = 0; k < no_repeat_ngram_size - 1; k++) {
if (tokens_shm[col + k] != tokens_shm[check_start_pos + k]) {
is_banned = false;
}
}
if (is_banned) {
auto token_to_be_banned = tokens_shm[col + no_repeat_ngram_size - 1];
lprobs[lprob_start + token_to_be_banned] = -INFINITY;
}
}
// Allocate blocks and threads based on
// batch size and sequence length and launch
// kernel
torch::Tensor ngram_repeat_block_cuda_forward(const torch::Tensor tokens,
torch::Tensor lprobs, int bsz,
int step, int beam_size,
int no_repeat_ngram_size) {
int threads = step - no_repeat_ngram_size + 2;
if (threads <= 0) return lprobs;
int max_predict_len = tokens.size(1);
int vocab_size = lprobs.size(1);
auto token_ptr = tokens.data_ptr<long>();
auto lprob_ptr = lprobs.data_ptr<float>();
int blocks = bsz * beam_size;
int shared_mem_size = (step + 1) * sizeof(long);
// Launching N blocks, where N is the number of hypotheses in the batch (beam_size * bsz)
// Launching T threads, where T is the number of previous ngram start positions in a hypothesis
// Allocating shared memory per block for faster access to the input tokens, since
// each token is read up to no_repeat_ngram_size times while comparing against the
// current ngram.
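// Worked example (illustration only): with bsz = 2, beam_size = 5, step = 4 and
// no_repeat_ngram_size = 3, this launches blocks = 10, threads = 4 - 3 + 2 = 3,
// and shared_mem_size = (4 + 1) * sizeof(long) bytes (5 longs), i.e. exactly the
// threads + no_repeat_ngram_size - 1 = 5 entries of tokens_shm covering tokens[0..step].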
banRepeatedTokens<<<blocks, threads, shared_mem_size>>>(
token_ptr, lprob_ptr, max_predict_len, vocab_size, no_repeat_ngram_size);
return lprobs;
}
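// This forward function is typically exposed to Python as a torch C++ extension,
// with the pybind11 binding kept in a separate .cpp translation unit rather than
// in this .cu file. A minimal sketch of such a binding (the exported name "forward"
// and the docstring are illustrative assumptions, not taken from fairseq):
//
//   #include <torch/extension.h>
//
//   torch::Tensor ngram_repeat_block_cuda_forward(const torch::Tensor tokens,
//                                                 torch::Tensor lprobs, int bsz,
//                                                 int step, int beam_size,
//                                                 int no_repeat_ngram_size);
//
//   PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
//     m.def("forward", &ngram_repeat_block_cuda_forward,
//           "No-repeat ngram blocking forward (CUDA)");
//   }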