"""
Language-related data loading helper functions and class wrappers.
"""
import re
import torch
import codecs
UNK_TOKEN = '<unk>'
PAD_TOKEN = '<pad>'
END_TOKEN = '<eos>'
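# Splits on runs of non-word characters; the capturing group keeps the
# separators as list items, e.g. re.split(r'(\W+)', 'a man, running')
# -> ['a', ' ', 'man', ', ', 'running'].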
SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')
class Dictionary(object):
    """Bidirectional mapping between words and integer ids."""
def __init__(self):
self.word2idx = {}
self.idx2word = []
def add_word(self, word):
if word not in self.word2idx:
self.idx2word.append(word)
self.word2idx[word] = len(self.idx2word) - 1
return self.word2idx[word]
def __len__(self):
return len(self.idx2word)
def __getitem__(self, a):
if isinstance(a, int):
return self.idx2word[a]
elif isinstance(a, list):
return [self.idx2word[x] for x in a]
elif isinstance(a, str):
return self.word2idx[a]
else:
raise TypeError("Query word/index argument must be int or str")
def __contains__(self, word):
return word in self.word2idx
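# Illustrative use of Dictionary (a sketch; indices depend on insertion order):
#
#   d = Dictionary()
#   d.add_word('man')   # -> 0, first word gets index 0
#   d.add_word('man')   # -> 0, duplicates are not re-added
#   d['man']            # -> 0, str lookup returns the index
#   d[0]                # -> 'man', int lookup returns the word
#   'man' in d          # -> True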
class Corpus(object):
    """Builds a Dictionary from text files and tokenizes lines into id tensors."""
def __init__(self):
self.dictionary = Dictionary()
def set_max_len(self, value):
self.max_len = value
    def load_file(self, filename):
        """Builds the dictionary from every line of a UTF-8 text file."""
        with codecs.open(filename, 'r', 'utf-8') as f:
            for line in f:
                self.add_to_corpus(line.strip())
        # Register the special tokens after the corpus vocabulary. Note that
        # END_TOKEN is never registered, so tokenize() maps it to UNK_TOKEN.
        self.dictionary.add_word(UNK_TOKEN)
        self.dictionary.add_word(PAD_TOKEN)
    def add_to_corpus(self, line):
        """Adds the lower-cased words of a text line to the dictionary."""
        for word in line.split():
            self.dictionary.add_word(word.lower())
    def tokenize(self, line, max_len=20):
        """Converts a text line into a 1-D LongTensor of token ids.

        Words are lower-cased; out-of-vocabulary words map to <unk>. With
        max_len > 0, long lines are truncated and short ones are padded
        with <eos> followed by <pad> up to max_len.
        """
        words = SENTENCE_SPLIT_REGEX.split(line.strip())
        # Drop empty strings and lone spaces left over from the split.
        words = [w.lower() for w in words if len(w) > 0 and w != ' ']
        # Strip a trailing period.
        if words and words[-1] == '.':
            words = words[:-1]
        if max_len > 0:
            if len(words) > max_len:
                words = words[:max_len]
            elif len(words) < max_len:
                # Append the end token, then pad out to exactly max_len.
                words = words + [END_TOKEN] + [PAD_TOKEN] * (max_len - len(words) - 1)
        ids = torch.LongTensor(len(words))
        for i, word in enumerate(words):
            if word not in self.dictionary:
                word = UNK_TOKEN
            if not isinstance(word, str):
                # Defensive: coerce any non-str token to an ASCII str.
                word = word.encode('ascii', 'ignore').decode('ascii')
            ids[i] = self.dictionary[word]
        return ids
def __len__(self):
return len(self.dictionary)
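# Minimal usage sketch, assuming a plain-text file with one phrase per line;
# the file name below is a placeholder, not something shipped with this repo.
if __name__ == '__main__':
    corpus = Corpus()
    corpus.load_file('phrases.txt')  # hypothetical example file
    ids = corpus.tokenize('the man in the red shirt', max_len=20)
    print(ids.size())  # torch.Size([20]): 6 words + <eos> + 13 <pad>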