import random
import math
import os
import pickle
from collections import defaultdict, namedtuple
import string
os.environ['TOKENIZERS_PARALLELISM'] = 'false' # turn off since we already use multiple worker processes for data loading
from transformers import AutoTokenizer, AutoModelWithLMHead, pipeline, set_seed, GPT2Tokenizer, GPT2Model
import numpy as np
from tqdm import tqdm
import torch
from fudge.util import suppress_stdout
from fudge.poetry_util import is_iambic, count_syllables, get_rhymes, get_rhyme_group
from fudge.constants import *
DatasetInfo = namedtuple('DatasetInfo',
['index2word', 'word2index', 'total_words', 'vocab', 'glove_embeddings'])
RhymeInfo = namedtuple('RhymeInfo',
['word2rhyme_group', 'rhyme_group_counts', 'rhyme_groups', 'index2rhyme_group', 'rhyme_group2index', 'total_rhyme_groups'])
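# Each training example produced by SplitLoader below (and consumed by collate)
# is a 9-tuple:
#   (inp, length, future_word, word_log_prob, pad_id, classification_label,
#    syllables_to_go, future_word_num_syllables, rhyme_group_index)
# Fields that a given task does not use are filled with -1.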
def collate(batch):
pad_id = batch[0][4]
inputs = [b[0] for b in batch]
lengths = torch.LongTensor([b[1] for b in batch])
max_length = lengths.max()
for i in range(len(inputs)):
if len(inputs[i]) < max_length:
inputs[i] = torch.cat([inputs[i], torch.zeros(max_length - len(inputs[i])).long()], dim=0) # actually 0 is fine as pad since it's masked out
inputs = torch.stack(inputs, dim=0)
future_words = torch.LongTensor([b[2] for b in batch]).unsqueeze(0).expand(len(batch), -1).clone() # batch x N=batch
labels = torch.zeros_like(future_words).long()
labels = labels.scatter(1, torch.arange(len(batch)).unsqueeze(1), torch.ones(len(batch)).long().unsqueeze(1)).clone()
log_probs = torch.Tensor([b[3] for b in batch])
classification_labels = [b[5] for b in batch] # batch
if type(classification_labels[0]) == list:
for i in range(len(classification_labels)):
assert len(classification_labels[i]) == lengths[i]
if len(classification_labels[i]) < max_length:
classification_labels[i] = torch.cat([torch.LongTensor(classification_labels[i]), -1 + torch.zeros(max_length - len(classification_labels[i])).long()], dim=0)
else:
classification_labels[i] = torch.LongTensor(classification_labels[i])
classification_labels = torch.stack(classification_labels, dim=0) # batch x seq
else:
assert type(classification_labels[0]) == int
classification_labels = torch.LongTensor(classification_labels) # they're just int labels
syllables_to_go = torch.LongTensor([b[6] for b in batch])
future_word_num_syllables = torch.LongTensor([b[7] for b in batch])
rhyme_group_index = torch.LongTensor([b[8] for b in batch])
return (inputs, lengths, future_words, log_probs, labels, classification_labels, syllables_to_go, future_word_num_syllables, rhyme_group_index)
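# For a batch of size B padded to length T, collate returns tensors shaped:
#   inputs (B x T, zero-padded), lengths (B), future_words (B x B, every example
#   paired with every in-batch future word), log_probs (B), labels (B x B with 1
#   on the diagonal, since example i's true future word sits at future_words[i][i]),
#   classification_labels (B for scalar labels, or B x T padded with -1 for
#   per-position labels), syllables_to_go (B), future_word_num_syllables (B),
#   rhyme_group_index (B).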
def load_rhyme_info(index2word, vocab):
word2rhyme_group = defaultdict(lambda: UNKNOWN_RHYME_GROUP)
rhyme_group_counts = defaultdict(lambda: 0)
rhyme_groups = set()
for word in index2word:
try:
rhyme_group = get_rhyme_group(word)
word2rhyme_group[word] = rhyme_group
rhyme_group_counts[rhyme_group] += (vocab[word] if word in vocab else 1) # for rare words not in vocab, just use 1
rhyme_groups.add(rhyme_group)
        except Exception: # word has no known rhyme group
rhyme_group_counts[UNKNOWN_RHYME_GROUP] += (vocab[word] if word in vocab else 1)
index2rhyme_group = [UNKNOWN_RHYME_GROUP] + sorted(list(rhyme_groups))
rhyme_group2index = {s: i for i, s in enumerate(index2rhyme_group)}
total_rhyme_groups = sum(rhyme_group_counts.values())
return RhymeInfo(word2rhyme_group=dict(word2rhyme_group),
rhyme_group_counts=dict(rhyme_group_counts),
rhyme_groups=rhyme_groups,
index2rhyme_group=index2rhyme_group,
rhyme_group2index=rhyme_group2index,
total_rhyme_groups=total_rhyme_groups)
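# Illustrative sketch (not executed here) of using load_rhyme_info when no
# precomputed rhyme_info pickle is supplied; index2word and vocab come from the
# Dataset below, and 'cat' is just a hypothetical query word:
#   rhyme_info = load_rhyme_info(dataset.index2word, dataset.vocab)
#   group = rhyme_info.word2rhyme_group.get('cat', UNKNOWN_RHYME_GROUP)
#   group_index = rhyme_info.rhyme_group2index[group]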
class Dataset:
def __init__(self, args):
print('loading data')
random.seed(args.seed)
self.batch_size = args.batch_size
self.data_dir = args.data_dir
self.topic = args.task == 'topic'
self.formality = args.task == 'formality'
self.iambic = args.task == 'iambic'
self.rhyme = args.task == 'rhyme'
self.newline = args.task == 'newline'
self.tokenizer = AutoTokenizer.from_pretrained(FORMALITY_MODEL_STRING if self.formality else TOPIC_MODEL_STRING)
self.tokenizer.add_special_tokens({'pad_token': PAD_TOKEN})
self.gpt_pad_id = self.tokenizer.encode(PAD_TOKEN)[0] # actually just the vocab size
sentences = []
self.vocab = defaultdict(lambda: 0)
if self.formality:
self.vocab['placeholder'] = 1 # anything so we don't crash
train, val, test = [], [], []
for category, label in [('formal', 1), ('informal', 0)]:
with open(os.path.join(args.data_dir, 'train', category), 'r') as rf:
for i, line in enumerate(rf):
if len(line) > FORMALITY_MAX_LEN:
line = ' '.join(line.strip()[:FORMALITY_MAX_LEN].split()[:-1]) # cutoff words until below max len; chosen so only ~20 examples affected in dataset
if i < FORMALITY_VAL_SIZE // 2:
val.append((line.strip(), label))
else:
train.append((line.strip(), label))
with open(os.path.join(args.data_dir, 'test', category), 'r') as rf:
for line in rf:
if len(line) > FORMALITY_MAX_LEN:
line = ' '.join(line.strip()[:FORMALITY_MAX_LEN].split()[:-1]) # cutoff words until below max len
test.append((line.strip(), label))
self.splits = {}
self.splits['train'], self.splits['val'], self.splits['test'] = train, val, test
else: # topic / poetry
for root, _, filenames in os.walk(args.data_dir):
for fname in filenames:
with open(os.path.join(root, fname), 'r') as rf:
for line in rf:
sentences.append(line.strip())
for word in line.strip().split(' '):
self.vocab[word] += 1
random.shuffle(sentences)
self.splits = {}
if args.debug:
self.splits['val'] = sentences
self.splits['test'] = sentences
self.splits['train'] = sentences
else:
self.splits['val'] = sentences[:TOPIC_VAL_SIZE]
self.splits['test'] = sentences[TOPIC_VAL_SIZE:2*TOPIC_VAL_SIZE]
self.splits['train'] = sentences[2*TOPIC_VAL_SIZE:]
if args.dataset_info is not None:
print('loading dataset info from file')
with open(args.dataset_info, 'rb') as rf:
dataset_info = pickle.load(rf)
self.vocab, self.total_words, self.index2word, self.word2index, self.glove_embeddings = \
dataset_info.vocab, dataset_info.total_words, dataset_info.index2word, dataset_info.word2index, dataset_info.glove_embeddings
self.dataset_info = dataset_info
else:
print('generating dataset info from scratch')
words_values = list(self.vocab.items())
words_values = sorted(words_values, key=lambda x: x[1], reverse=True)
if args.glove_file is None:
print('no glove embeddings given')
for word, _ in words_values[VOCAB_SIZE:]: # only use somewhat common tokens
del self.vocab[word]
glove_embeddings = None
else:
print('loading glove embeddings')
glove_embeddings = {}
with open(args.glove_file, 'r') as rf:
for i, line in enumerate(rf):
if i % GLOVE_PRINT_PROGRESS_FREQ == 0:
print(i)
line = line.strip().split()
if len(line) != GLOVE_DIM + 1:
continue # skip multi-word embeddings which are rare anyway
glove_embeddings[line[0]] = [float(x) for x in line[1:]]
for word, _ in words_values:
if word not in glove_embeddings:
del self.vocab[word]
self.total_words = sum(self.vocab.values())
self.index2word = [PAD_TOKEN] + sorted(list(self.vocab.keys()))
self.word2index = {s: i for i, s in enumerate(self.index2word)}
self.vocab = dict(self.vocab) # so we can pickle later
if glove_embeddings is None:
self.glove_embeddings = None
else:
self.glove_embeddings = torch.stack([torch.zeros(GLOVE_DIM)] + [torch.Tensor(glove_embeddings[word]) for word in self.index2word[1:]], dim=0)
self.dataset_info = DatasetInfo(index2word=self.index2word,
word2index=self.word2index,
total_words=self.total_words,
vocab=self.vocab,
glove_embeddings=self.glove_embeddings)
if self.rhyme:
if args.rhyme_info is not None:
print('loading rhyme info from file')
with open(args.rhyme_info, 'rb') as rf:
self.rhyme_info = pickle.load(rf)
else:
self.rhyme_info = load_rhyme_info(self.index2word, self.vocab)
self.word2rhyme_group, self.rhyme_group_counts, self.rhyme_groups, self.index2rhyme_group, self.rhyme_group2index, self.total_rhyme_groups = \
defaultdict(lambda: UNKNOWN_RHYME_GROUP, self.rhyme_info.word2rhyme_group), self.rhyme_info.rhyme_group_counts, self.rhyme_info.rhyme_groups, self.rhyme_info.index2rhyme_group, self.rhyme_info.rhyme_group2index, self.rhyme_info.total_rhyme_groups
print('done loading data')
print('split sizes:')
for key in ['train', 'val', 'test']:
print(key, len(self.splits[key]))
if not self.formality:
print('total words', self.total_words)
print('vocab size', len(self.index2word))
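    # After __init__, self.splits['train'/'val'/'test'] holds raw sentence
    # strings for the topic and poetry tasks, or (sentence, label) pairs for
    # formality; SplitLoader below tokenizes these into examples on the fly.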
def shuffle(self, split, seed=None):
assert split in ['train', 'val', 'test']
if seed is not None:
random.seed(seed)
random.shuffle(self.splits[split])
def loader(self, split, num_workers=20, indices=None):
assert split in ['train', 'val', 'test']
data = self.splits[split] if indices is None else [self.splits[split][i] for i in indices]
return torch.utils.data.DataLoader(SplitLoader(data, self), batch_size=self.batch_size, pin_memory=True, collate_fn=collate, num_workers=num_workers)
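# Illustrative sketch (not executed here) of building the dataset; the namespace
# fields listed are the ones __init__ actually reads, while the concrete values
# are made up for illustration:
#   import argparse
#   args = argparse.Namespace(task='topic', data_dir='topic_data', batch_size=32,
#                             seed=1, debug=False, dataset_info=None,
#                             rhyme_info=None, glove_file=None)
#   dataset = Dataset(args)
#   train_loader = dataset.loader('train', num_workers=4)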
class SplitLoader(torch.utils.data.IterableDataset):
def __init__(self, data, parent):
        super().__init__()
self.data = data
self.pos = 0
self.parent = parent
def __len__(self):
return len(self.data)
def __iter__(self):
return self
def __next__(self):
increment = 1
worker_info = torch.utils.data.get_worker_info()
        if worker_info is not None: # in a worker process
increment = worker_info.num_workers
worker_id = worker_info.id
if self.pos == 0:
self.pos = worker_id
valid = False
while not valid:
if self.pos >= len(self):
raise StopIteration
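            # Each task branch below assembles the same 9-tuple `example`
            # documented above collate(); if construction fails (sentence too
            # short, future word not in the vocab, syllable constraint violated),
            # `valid` stays False, self.pos advances, and the next item is tried.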
if self.parent.topic:
failed = False
future_word_num_syllables, rhyme_group_index, syllables_to_go = -1, -1, -1
raw_sentence, classification_label = self.data[self.pos], -1
original_sentence = raw_sentence.split()
sentence = self.parent.tokenizer.encode(raw_sentence, return_tensors='pt')[0]
length = len(sentence)
min_sentence_length = MIN_SENTENCE_LENGTH
if len(sentence) > min_sentence_length: # set to 3. well, everything in data is > 3 for the bag of words task
pos_to_split = random.randint(1, length - 1) # for lm, learn all positions at once
inp = sentence[:pos_to_split]
length = len(inp)
num_words_in_input = len(self.parent.tokenizer.decode(inp).split())
if not failed and num_words_in_input < len(original_sentence):
future_word_position_max = len(original_sentence) - 1
future_word_position = random.randint(num_words_in_input-1, future_word_position_max) # allow the last possibly partial word though
future_word = original_sentence[future_word_position]
unstripped_future_word = future_word
future_word = future_word.strip().strip(string.punctuation) # NOTE: we didn't strip punctuation for the topic bag of words paper experiments for our method. it doesn't make much difference, though.
if not failed and future_word in self.parent.word2index.keys():
word_log_prob = math.log(self.parent.vocab[future_word] / self.parent.total_words) # roughly baseline prob of word under noise model
future_word = self.parent.word2index[future_word]
pad_id = self.parent.gpt_pad_id
example = (inp, length, future_word, word_log_prob, pad_id, classification_label, syllables_to_go, future_word_num_syllables, rhyme_group_index)
valid = not failed
elif self.parent.formality:
future_word_num_syllables, rhyme_group_index, syllables_to_go = -1, -1, -1
raw_sentence, classification_label = self.data[self.pos]
original_sentence = raw_sentence.split()
sentence = self.parent.tokenizer.encode(raw_sentence, return_tensors='pt')[0]
length = len(sentence)
min_sentence_length = MIN_SENTENCE_LENGTH
if len(sentence) > min_sentence_length: # set to 3. well, everything in data is > 3 for the bag of words task
pos_to_split = length # no need to split; we're going to train on all possible prefixes simultaneously for efficiency
inp = sentence[:pos_to_split]
length = len(inp)
num_words_in_input = len(self.parent.tokenizer.decode(inp).split())
# only look up to 10 words ahead if we're doing count syllables, since we'll filter out anything more than 10 syllables ahead anyway
future_word_position_max = len(original_sentence) - 1
future_word_position = 0
future_word = 'placeholder'
unstripped_future_word = future_word
future_word = future_word.strip().strip(string.punctuation) # NOTE: we didn't strip punctuation for the topic bag of words paper experiments for our method. it doesn't make much difference, though.
word_log_prob, future_word = 0, 0
pad_id = self.parent.gpt_pad_id
example = (inp, length, future_word, word_log_prob, pad_id, classification_label, syllables_to_go, future_word_num_syllables, rhyme_group_index)
valid = True
elif self.parent.iambic:
failed = False
future_word_num_syllables, rhyme_group_index, syllables_to_go = -1, -1, -1
raw_sentence, classification_label = self.data[self.pos], -1
original_sentence = raw_sentence.split()
sentence = self.parent.tokenizer.encode(raw_sentence, return_tensors='pt')[0]
length = len(sentence)
min_sentence_length = MIN_SENTENCE_LENGTH
if len(sentence) > min_sentence_length: # set to 3. well, everything in data is > 3 for the bag of words task
pos_to_split = random.randint(0, length - 1)
# try to get a subseq of exactly 10 syllables
inp = sentence[pos_to_split:]
num_syllables = 0
checked = False
for i in range(1, len(inp)):
decoded = self.parent.tokenizer.decode(inp[:i])
num_syllables = count_syllables(decoded)
if num_syllables > POETRY_LINE_SYLLABLES:
inp = inp[:i-1] # might get a few data points where the split is in the middle of a word, but it should be ok for learning.
last_line_length = i-1
decoded = self.parent.tokenizer.decode(inp)
num_syllables = count_syllables(decoded)
checked = True
break
if not checked or num_syllables != POETRY_LINE_SYLLABLES:
failed = True
length = len(inp)
num_words_in_input = len(self.parent.tokenizer.decode(inp).split())
classification_label = [is_iambic(self.parent.tokenizer.decode(inp)) for _ in range(length)] # predict for whole seq including future
# only look up to 10 words ahead if we're doing count syllables, since we'll filter out anything more than 10 syllables ahead anyway
future_word_position_max = len(original_sentence) - 1
future_word_position = 0
future_word = 'placeholder'
unstripped_future_word = future_word
future_word = future_word.strip().strip(string.punctuation) # NOTE: we didn't strip punctuation for the topic bag of words paper experiments for our method. it doesn't make much difference, though.
if not failed:
word_log_prob, future_word = 0, 0
pad_id = self.parent.gpt_pad_id
example = (inp, length, future_word, word_log_prob, pad_id, classification_label, syllables_to_go, future_word_num_syllables, rhyme_group_index)
valid = not failed
elif self.parent.rhyme:
failed = False
future_word_num_syllables, rhyme_group_index = -1, -1
raw_sentence, classification_label = self.data[self.pos], -1
original_sentence = raw_sentence.split()
sentence = self.parent.tokenizer.encode(raw_sentence, return_tensors='pt')[0]
length = len(sentence)
min_sentence_length = MIN_SENTENCE_LENGTH
if len(sentence) > min_sentence_length: # set to 3. well, everything in data is > 3 for the bag of words task
pos_to_split = random.randint(1, length - 1) # for lm, learn all positions at once
inp = sentence[:pos_to_split]
length = len(inp)
num_words_in_input = len(self.parent.tokenizer.decode(inp).split())
if not failed and num_words_in_input < len(original_sentence):
# only look up to 10 words ahead if we're doing count syllables, since we'll filter out anything more than 10 syllables ahead anyway
future_word_position_max = min(len(original_sentence) - 1, num_words_in_input + MAX_COUNT_SYLLABLE_DIST)
future_word_position = random.randint(num_words_in_input-1, future_word_position_max) # allow the last possibly partial word though
future_word = original_sentence[future_word_position]
unstripped_future_word = future_word
future_word = future_word.strip().strip(string.punctuation) # NOTE: we didn't strip punctuation for the topic bag of words paper experiments for our method. it doesn't make much difference, though.
words_in_between = original_sentence[num_words_in_input-1:future_word_position+1]
syllables_to_go = count_syllables(' '.join(words_in_between))
if syllables_to_go > MAX_COUNT_SYLLABLE_DIST:
failed = True
future_word_num_syllables = count_syllables(future_word)
rhyme_group = self.parent.word2rhyme_group[future_word]
rhyme_group_index = self.parent.rhyme_group2index[rhyme_group]
# truncate context a bit since we're just doing couplets. random length from 1 to max desired length for this purpose.
desired_length = random.randint(1, MAX_COUNT_SYLLABLE_INPUT_LENGTH)
inp = inp[-desired_length:]
length = len(inp)
if not failed and future_word in self.parent.word2index.keys():
word_log_prob = math.log(self.parent.rhyme_group_counts[rhyme_group] / self.parent.total_rhyme_groups)
future_word = rhyme_group_index # future conditioning is just the rhyme group in this case
pad_id = self.parent.gpt_pad_id
example = (inp, length, future_word, word_log_prob, pad_id, classification_label, syllables_to_go, future_word_num_syllables, rhyme_group_index)
valid = not failed
elif self.parent.newline:
failed = False
future_word_num_syllables, rhyme_group_index = -1, -1
raw_sentence, classification_label = self.data[self.pos], -1
original_sentence = raw_sentence.split()
sentence = self.parent.tokenizer.encode(raw_sentence, return_tensors='pt')[0]
length = len(sentence)
min_sentence_length = MIN_SENTENCE_LENGTH
if len(sentence) > min_sentence_length: # set to 3. well, everything in data is > 3 for the bag of words task
pos_to_split = random.randint(1, length - 1) # for lm, learn all positions at once
inp = sentence[:pos_to_split]
while pos_to_split < len(sentence):
if len(self.parent.tokenizer.decode(inp).split()) == len(self.parent.tokenizer.decode(sentence[:pos_to_split + 1]).split()):
pos_to_split += 1
inp = sentence[:pos_to_split]
else:
break
length = len(inp)
num_words_in_input = len(self.parent.tokenizer.decode(inp).split())
if not failed and num_words_in_input < len(original_sentence):
# only look up to 10 words ahead if we're doing count syllables, since we'll filter out anything more than 10 syllables ahead anyway
future_word_position_max = len(original_sentence) - 1
future_word_position = random.randint(num_words_in_input-1, future_word_position_max) # allow the last possibly partial word though
future_word = original_sentence[future_word_position]
unstripped_future_word = future_word
future_word = future_word.strip().strip(string.punctuation) # NOTE: we didn't strip punctuation for the topic bag of words paper experiments for our method. it doesn't make much difference, though.
# future_word = original_sentence[-1] # useful for debugging
words_in_between = original_sentence[num_words_in_input-1:future_word_position+1]
syllables_to_go = count_syllables(' '.join(words_in_between))
if syllables_to_go > MAX_COUNT_SYLLABLE_DIST:
failed = True
# truncate context a bit since we're just doing couplets. random length from 1 to max desired length for this purpose.
desired_length = random.randint(1, MAX_COUNT_SYLLABLE_INPUT_LENGTH)
# desired_length = 10 # useful for debugging
inp = inp[-desired_length:]
length = len(inp)
true_label = 1 if unstripped_future_word.strip()[-1] in PHRASE_ENDS else 0 # common ways to end a phrase
classification_label = [-1 for _ in range(length)]
classification_label[-1] = true_label # only learn at the last position
if not failed and future_word in self.parent.word2index.keys():
word_log_prob = math.log(self.parent.vocab[future_word] / self.parent.total_words) # roughly baseline prob of word under noise model
future_word = self.parent.word2index[future_word]
pad_id = self.parent.gpt_pad_id
example = (inp, length, future_word, word_log_prob, pad_id, classification_label, syllables_to_go, future_word_num_syllables, rhyme_group_index)
valid = not failed
else:
raise NotImplementedError
self.pos += increment
return example
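if __name__ == '__main__':
    # Minimal illustrative smoke test for collate() on synthetic examples (not
    # used by the training pipeline); it only assumes the 9-tuple example layout
    # documented above collate().
    fake_batch = []
    for i, n in enumerate([5, 3, 7]):
        inp = torch.randint(0, 100, (n,)).long()
        # (inp, length, future_word, word_log_prob, pad_id, classification_label,
        #  syllables_to_go, future_word_num_syllables, rhyme_group_index)
        fake_batch.append((inp, n, i, -5.0, 0, -1, -1, -1, -1))
    inputs, lengths, future_words, log_probs, labels, classification_labels, \
        syllables_to_go, future_word_num_syllables, rhyme_group_index = collate(fake_batch)
    print('inputs', inputs.shape) # torch.Size([3, 7]), zero-padded to the longest sequence
    print('future_words', future_words.shape) # torch.Size([3, 3]), batch x batch
    print('labels', labels) # ones on the diagonal: example i matches future word i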