envibert / envibert_tokenizer.py

Update envibert_tokenizer.py

f9b0bf1 almost 3 years ago

11 kB

	# !pip install sentencepiece==0.1.96 transformers==4.10.0
	import sentencepiece as spm
	import os
	from transformers import PreTrainedTokenizer
	from collections import Counter
	from typing import List, Optional, Tuple


	class RobertaTokenizer(PreTrainedTokenizer):
	def __init__(
	self,
	pretrained_file,
	bos_token="<s>",
	eos_token="</s>",
	sep_token="</s>",
	cls_token="<s>",
	unk_token="<unk>",
	pad_token="<pad>",
	mask_token="<mask>",
	**kwargs
	):
	super().__init__(
	bos_token=bos_token,
	eos_token=eos_token,
	unk_token=unk_token,
	sep_token=sep_token,
	cls_token=cls_token,
	pad_token=pad_token,
	mask_token=mask_token,
	**kwargs,
	)

	# load bpe model and vocab file
	sentencepiece_model = os.path.join(pretrained_file, 'sentencepiece.bpe.model')
	vocab_file = os.path.join(pretrained_file, 'dict.txt')
	self.sp_model = spm.SentencePieceProcessor()
	self.sp_model.Load(
	sentencepiece_model) # please dont use anything from sp_model bcz it makes everything goes wrong

	self.bpe_dict = Dictionary().load(vocab_file)

	# Mimic fairseq token-to-id alignment for the first 4 token
	self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

	# The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
	self.fairseq_offset = 0

	self.fairseq_tokens_to_ids["<mask>"] = len(self.bpe_dict) + self.fairseq_offset
	self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

	def _tokenize(self, text):
	return self.sp_model.EncodeAsPieces(text)

	def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
	#TODO
	return "", ""

	def _convert_token_to_id(self, token):
	""" Converts a token (str) in an id using the vocab. """
	if token in self.fairseq_tokens_to_ids:
	return self.fairseq_tokens_to_ids[token]
	spm_id = self.bpe_dict.index(token)
	return spm_id

	def _convert_id_to_token(self, index):
	"""Converts an index (integer) in a token (str) using the vocab."""
	if index in self.fairseq_ids_to_tokens:
	return self.fairseq_ids_to_tokens[index]
	return self.bpe_dict[index]

	def build_inputs_with_special_tokens(
	self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
	) -> List[int]:
	"""
	Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
	adding special tokens.

	This implementation does not add special tokens and this method should be overridden in a subclass.

	Args:
	token_ids_0 (:obj:`List[int]`): The first tokenized sequence.
	token_ids_1 (:obj:`List[int]`, `optional`): The second tokenized sequence.

	Returns:
	:obj:`List[int]`: The model input with special tokens.
	"""
	return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

	def create_token_type_ids_from_sequences(
	self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
	) -> List[int]:
	"""
	Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
	not make use of token type ids, therefore a list of zeros is returned.

	Args:
	token_ids_0 (:obj:`List[int]`):
	List of IDs.
	token_ids_1 (:obj:`List[int]`, `optional`):
	Optional second list of IDs for sequence pairs.

	Returns:
	:obj:`List[int]`: List of zeros.

	"""

	sep = [self.sep_token_id]
	cls = [self.cls_token_id]

	return len(cls + token_ids_0 + sep) * [0]

	@property
	def vocab_size(self):
	return len(self.bpe_dict) + self.fairseq_offset + 1 # Add the <mask> token

	def get_vocab(self):
	vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
	vocab.update(self.added_tokens_encoder)
	return vocab


	class Dictionary(object):
	"""A mapping from symbols to consecutive integers"""

	def __init__(
	self,
	pad='<pad>',
	eos='</s>',
	unk='<unk>',
	bos='<s>',
	extra_special_symbols=None,
	):
	self.unk_word, self.pad_word, self.eos_word = unk, pad, eos
	self.symbols = []
	self.count = []
	self.indices = {}
	self.bos_index = self.add_symbol(bos)
	self.pad_index = self.add_symbol(pad)
	self.eos_index = self.add_symbol(eos)
	self.unk_index = self.add_symbol(unk)
	if extra_special_symbols:
	for s in extra_special_symbols:
	self.add_symbol(s)
	self.nspecial = len(self.symbols)

	def __eq__(self, other):
	return self.indices == other.indices

	def __getitem__(self, idx):
	if idx < len(self.symbols):
	return self.symbols[idx]
	return self.unk_word

	def __len__(self):
	"""Returns the number of symbols in the dictionary"""
	return len(self.symbols)

	def __contains__(self, sym):
	return sym in self.indices

	def index(self, sym):
	"""Returns the index of the specified symbol"""
	assert isinstance(sym, str)
	if sym in self.indices:
	return self.indices[sym]
	return self.unk_index

	def unk_string(self, escape=False):
	"""Return unknown string, optionally escaped as: <<unk>>"""
	if escape:
	return '<{}>'.format(self.unk_word)
	else:
	return self.unk_word

	def add_symbol(self, word, n=1):
	"""Adds a word to the dictionary"""
	if word in self.indices:
	idx = self.indices[word]
	self.count[idx] = self.count[idx] + n
	return idx
	else:
	idx = len(self.symbols)
	self.indices[word] = idx
	self.symbols.append(word)
	self.count.append(n)
	return idx

	def update(self, new_dict):
	"""Updates counts from new dictionary."""
	for word in new_dict.symbols:
	idx2 = new_dict.indices[word]
	if word in self.indices:
	idx = self.indices[word]
	self.count[idx] = self.count[idx] + new_dict.count[idx2]
	else:
	idx = len(self.symbols)
	self.indices[word] = idx
	self.symbols.append(word)
	self.count.append(new_dict.count[idx2])

	def finalize(self, threshold=-1, nwords=-1, padding_factor=8):
	"""Sort symbols by frequency in descending order, ignoring special ones.

	Args:
	- threshold defines the minimum word count
	- nwords defines the total number of words in the final dictionary,
	including special symbols
	- padding_factor can be used to pad the dictionary size to be a
	multiple of 8, which is important on some hardware (e.g., Nvidia
	Tensor Cores).
	"""
	if nwords <= 0:
	nwords = len(self)

	new_indices = dict(zip(self.symbols[:self.nspecial], range(self.nspecial)))
	new_symbols = self.symbols[:self.nspecial]
	new_count = self.count[:self.nspecial]

	c = Counter(dict(sorted(zip(self.symbols[self.nspecial:], self.count[self.nspecial:]))))
	for symbol, count in c.most_common(nwords - self.nspecial):
	if count >= threshold:
	new_indices[symbol] = len(new_symbols)
	new_symbols.append(symbol)
	new_count.append(count)
	else:
	break

	threshold_nwords = len(new_symbols)
	if padding_factor > 1:
	i = 0
	while threshold_nwords % padding_factor != 0:
	symbol = 'madeupword{:04d}'.format(i)
	new_indices[symbol] = len(new_symbols)
	new_symbols.append(symbol)
	new_count.append(0)
	i += 1
	threshold_nwords += 1

	assert len(new_symbols) % padding_factor == 0
	assert len(new_symbols) == len(new_indices)

	self.count = list(new_count)
	self.symbols = list(new_symbols)
	self.indices = new_indices

	def bos(self):
	"""Helper to get index of beginning-of-sentence symbol"""
	return self.bos_index

	def pad(self):
	"""Helper to get index of pad symbol"""
	return self.pad_index

	def eos(self):
	"""Helper to get index of end-of-sentence symbol"""
	return self.eos_index

	def unk(self):
	"""Helper to get index of unk symbol"""
	return self.unk_index

	@classmethod
	def load(cls, f):
	"""Loads the dictionary from a text file with the format:

	```
	<symbol0> <count0>
	<symbol1> <count1>
	...
	```
	"""
	d = cls()
	d.add_from_file(f)
	return d

	def add_from_file(self, f):
	"""
	Loads a pre-existing dictionary from a text file and adds its symbols
	to this instance.
	"""
	if isinstance(f, str):
	try:
	with open(f, 'r', encoding='utf-8') as fd:
	self.add_from_file(fd)
	except FileNotFoundError as fnfe:
	raise fnfe
	except UnicodeError:
	raise Exception("Incorrect encoding detected in {}, please "
	"rebuild the dataset".format(f))
	return

	lines = f.readlines()
	indices_start_line = self._load_meta(lines)
	for line in lines[indices_start_line:]:
	idx = line.rfind(' ')
	if idx == -1:
	raise ValueError("Incorrect dictionary format, expected '<token> <cnt>'")
	word = line[:idx]
	count = int(line[idx + 1:])
	self.indices[word] = len(self.symbols)
	self.symbols.append(word)
	self.count.append(count)

	def _save(self, f, kv_iterator):
	if isinstance(f, str):
	os.makedirs(os.path.dirname(f), exist_ok=True)
	with open(f, 'w', encoding='utf-8') as fd:
	return self.save(fd)
	for k, v in kv_iterator:
	print('{} {}'.format(k, v), file=f)

	def _get_meta(self):
	return [], []

	def _load_meta(self, lines):
	return 0

	def save(self, f):
	"""Stores dictionary into a text file"""
	ex_keys, ex_vals = self._get_meta()
	self._save(f, zip(ex_keys + self.symbols[self.nspecial:], ex_vals + self.count[self.nspecial:]))