/*
Copyright (c) Microsoft Corporation.
Licensed under the MIT License.
*/

/*
Kernel implementation for blocking repeated n-grams.
*/

#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>
#include <torch/extension.h>
#include <vector>

// Ban repeated ngrams of length = 'no_repeat_ngram_size'
__global__ void banRepeatedTokens(long* __restrict__ tokens,
                                  float* __restrict__ lprobs,
                                  int max_predict_len, int vocab_size,
                                  int no_repeat_ngram_size) {
  auto row = blockIdx.x;
  auto col = threadIdx.x;
  auto start = row * (max_predict_len) + col;
  // Each thread compares the ngram starting at its own index with the
  // final ngram, which starts at step - no_repeat_ngram_size + 2
  auto check_start_pos = blockDim.x;
  auto lprob_start = row * vocab_size;
  bool is_banned = true;
  extern __shared__ long tokens_shm[];
  // Each thread stages one token in shared memory; the last thread also
  // copies the trailing tokens that complete the final ngram.
  tokens_shm[col] = tokens[start];
  if (col == blockDim.x - 1) {
    for (int i = 1; i < no_repeat_ngram_size; i++) {
      if (col + i < max_predict_len) {
        tokens_shm[col + i] = tokens[start + i];
      }
    }
  }
  __syncthreads();

  // Compare this thread's (n-1)-token prefix against the final prefix.
  for (int k = 0; k < no_repeat_ngram_size - 1; k++) {
    if (tokens_shm[col + k] != tokens_shm[check_start_pos + k]) {
      is_banned = false;
    }
  }
  // On a match, ban the token that would complete the repeated ngram.
  if (is_banned == true) {
    auto token_to_be_banned = tokens_shm[col + no_repeat_ngram_size - 1];
    lprobs[lprob_start + token_to_be_banned] = -INFINITY;
  }
}

// Allocate blocks and threads based on batch size and sequence length,
// then launch the kernel.
torch::Tensor ngram_repeat_block_cuda_forward(const torch::Tensor tokens,
                                              torch::Tensor lprobs, int bsz,
                                              int step, int beam_size,
                                              int no_repeat_ngram_size) {
  int threads = step - no_repeat_ngram_size + 2;
  if (threads <= 0) return lprobs;
  int max_predict_len = tokens.size(1);
  int vocab_size = lprobs.size(1);
  auto token_ptr = tokens.data_ptr<long>();
  auto lprob_ptr = lprobs.data_ptr<float>();
  int blocks = bsz * beam_size;
  int shared_mem_size = (step + 1) * sizeof(long);

  // Launching N blocks where N is the number of samples in a batch
  // (beams * bsz).
  // Launching T threads where T is the number of previous ngrams in a
  // sample.
  // Allocating shared mem per block for faster access of input tokens,
  // since each token will be accessed N times to compare with the current
  // ngram, where N is the ngram size.
  banRepeatedTokens<<<blocks, threads, shared_mem_size>>>(
      token_ptr, lprob_ptr, max_predict_len, vocab_size,
      no_repeat_ngram_size);
  return lprobs;
}
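
/*
A minimal sketch of how ngram_repeat_block_cuda_forward might be exposed to
Python as a torch extension. This is an assumption for illustration, not the
repo's exact binding code: in the actual repo the binding typically lives in
a separate .cpp file, and the module name, function name "forward", and the
input checks shown here are hypothetical.
*/

#include <torch/extension.h>

// Forward declaration of the CUDA entry point defined above.
torch::Tensor ngram_repeat_block_cuda_forward(const torch::Tensor tokens,
                                              torch::Tensor lprobs, int bsz,
                                              int step, int beam_size,
                                              int no_repeat_ngram_size);

// Host-side wrapper: validate inputs, then dispatch to the CUDA kernel.
torch::Tensor ngram_repeat_block_forward(const torch::Tensor tokens,
                                         torch::Tensor lprobs, int bsz,
                                         int step, int beam_size,
                                         int no_repeat_ngram_size) {
  TORCH_CHECK(tokens.is_cuda(), "tokens must be a CUDA tensor");
  TORCH_CHECK(lprobs.is_cuda(), "lprobs must be a CUDA tensor");
  TORCH_CHECK(tokens.is_contiguous(), "tokens must be contiguous");
  TORCH_CHECK(lprobs.is_contiguous(), "lprobs must be contiguous");
  return ngram_repeat_block_cuda_forward(tokens, lprobs, bsz, step,
                                         beam_size, no_repeat_ngram_size);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &ngram_repeat_block_forward,
        "No-repeat ngram blocking (CUDA)");
}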