import torch
import nltk
from nltk.translate.bleu_score import SmoothingFunction
from tqdm import tqdm

def calculate_perplexity(model, tokens, prompt_len, bsz=1, marker=False):
    """
    Calculate the perplexity of the given tokens using the provided model, ignoring padding tokens.

    Args:
        model: Llama model
        tokens (List[List[int]] or torch.Tensor): Input tokens of shape (n_prompt * n_drafts, seqlen)
        prompt_len (int): Prefix (prompt) length
        bsz (int): Batch size
        marker (bool): Whether to show a progress bar

    Returns:
        Perplexity of each generation, as a tensor of shape (n_prompt * n_drafts,)
    """
    it = range(0, len(tokens), bsz)
    if marker:
        it = tqdm(it)
    ppl = torch.zeros(len(tokens))
    for start in it:
        end = start + bsz
        data = tokens[start:end]
        if not isinstance(data, list):
            data = data.tolist()
        # Remove any padding tokens (-1) in generations
        for d_idx in range(len(data)):
            cur = data[d_idx]
            if -1 in cur:
                data[d_idx] = cur[:cur.index(-1)]
        # Calculate per-token cross entropy on the tokens (no new tokens are generated)
        ce_loss = model.generate(data, max_gen_len=0, temperature=-1, top_p=-1, grade=True)
        # Drop the prompt positions; keep only scores for generated tokens
        ce_loss = ce_loss[:, prompt_len - 1:]  # Subtract 1 because the first token (start token) is removed
        # Average over non-padding positions and convert to perplexity
        lengths = (ce_loss != 0).sum(dim=-1)
        mean = ce_loss.sum(dim=-1) / lengths
        ppl[start:end] = torch.exp(-1 * mean)
    return ppl

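
# Illustrative sketch (an assumption, not part of the original code): `calculate_perplexity`
# expects `model.generate(..., grade=True)` to return a (bsz, seqlen) tensor of per-token
# scores with padded positions equal to 0. The mock model and score values below are
# placeholders that only exercise that assumed interface.
class _MockGradingModel:
    def generate(self, prompt_tokens, max_gen_len, temperature, top_p, grade):
        seqlen = max(len(p) for p in prompt_tokens)
        scores = torch.zeros(len(prompt_tokens), seqlen)
        for i, p in enumerate(prompt_tokens):
            scores[i, :len(p)] = -1.0  # placeholder per-token score
        return scores


def _demo_perplexity():
    # Two drafts of arbitrary token ids; -1 marks padding in the second draft.
    drafts = [[1, 5, 7, 9, 2, 6], [1, 3, 4, 8, -1, -1]]
    return calculate_perplexity(_MockGradingModel(), drafts, prompt_len=2)
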
def calculate_diversity(generations, k=4):
    """
    Calculate diversity of generations using Self-BLEU.

    Args:
        generations (List[List[List[int]]]): Tokenized drafts grouped by prompt, (n_prompt, n_drafts, seqlen)
        k (int, Optional): Maximum n-gram order to use for BLEU

    Returns:
        Average Self-BLEU across all drafts (float); lower means more diverse
    """
    nltk.download('punkt')  # Can be deleted once downloaded
    smooth = SmoothingFunction()
    bleus = []
    for drafts in generations:
        tokenized_drafts = []
        # Stringify tokens, dropping padding (-1)
        for d in drafts:
            if -1 in d:
                d = d[:d.index(-1)]
            tokenized_drafts.append([str(n) for n in d])
        # Use up to k-gram weights, capped by the length of the shortest draft
        minlength = min(len(g) for g in tokenized_drafts)
        minlength = min(minlength, k)
        weights = tuple(1.0 / minlength for _ in range(minlength))
        # Self-BLEU: score each draft against all other drafts of the same prompt
        for i in range(len(drafts)):
            src = tokenized_drafts[i]
            ref = tokenized_drafts[:i] + tokenized_drafts[i + 1:]
            tmp = nltk.translate.bleu_score.sentence_bleu(references=ref,
                                                          hypothesis=src,
                                                          weights=weights,
                                                          smoothing_function=smooth.method1)
            bleus.append(tmp)
    bleus = torch.Tensor(bleus)
    return torch.mean(bleus)

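
# Illustrative usage sketch (assumption, not part of the original code): two prompts with
# a few drafts each; the token ids are arbitrary placeholders.
def _demo_diversity():
    generations = [
        [[4, 8, 15, 16, 23], [4, 8, 15, 42, 7], [9, 3, 5, 1, 2]],
        [[7, 7, 7, 7, 7], [7, 7, 7, 7, -1], [1, 2, 3, 4, 5]],
    ]
    return calculate_diversity(generations)
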
def calculate_ngram_repetition(sequences):
    """
    Calculate n-gram uniqueness scores of `sequences`.

    Args:
        sequences (List[List[int]]): Generated sequences

    Returns:
        (unigram_uniqueness, bigram_uniqueness, trigram_uniqueness): the average fraction
        of distinct n-grams per sequence
    """
    u_total = 0
    b_total = 0
    t_total = 0
    # Iterate through all sequences, regardless of which prompt they belong to
    for gen in sequences:
        # Drop padding tokens (-1)
        if -1 in gen:
            gen = gen[:gen.index(-1)]
        unigrams, bigrams, trigrams = [], [], []
        o = [str(i) for i in gen]
        # Create lists of n-grams for the generation
        for i in range(len(o)):
            unigrams.append(o[i])
        for i in range(len(o) - 1):
            bigrams.append(o[i] + '_' + o[i + 1])
        for i in range(len(o) - 2):
            trigrams.append(o[i] + '_' + o[i + 1] + '_' + o[i + 2])
        # Fraction of distinct n-grams in this generation
        u = len(set(unigrams)) / len(unigrams)
        b = len(set(bigrams)) / len(bigrams)
        t = len(set(trigrams)) / len(trigrams)
        u_total += u
        b_total += b
        t_total += t
    return u_total / len(sequences), b_total / len(sequences), t_total / len(sequences)

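
# Illustrative usage sketch (assumption, not part of the original code): the token ids
# below are arbitrary placeholders.
def _demo_ngram_repetition():
    # The first sequence repeats a phrase; the second has no repetition.
    sequences = [[5, 6, 7, 5, 6, 7, 5, 6], [1, 2, 3, 4, 5, 6, 7, 8]]
    return calculate_ngram_repetition(sequences)


if __name__ == "__main__":
    # Smoke-test the metrics on the placeholder data defined in the demo helpers above.
    print("perplexity (mock model):", _demo_perplexity())
    print("average Self-BLEU:", _demo_diversity())
    print("n-gram uniqueness (uni, bi, tri):", _demo_ngram_repetition())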