# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import json
import os

import numpy as np
import tensorflow as tf

FLAGS = tf.flags.FLAGS
class Vocabulary(object):
  """Class that holds a vocabulary for the dataset."""

  def __init__(self, filename):
    self._id_to_word = []
    self._word_to_id = {}
    self._unk = -1
    self._bos = -1
    self._eos = -1

    with tf.gfile.Open(filename) as f:
      idx = 0
      for line in f:
        word_name = line.strip()
        if word_name == '<S>':
          self._bos = idx
        elif word_name == '</S>':
          self._eos = idx
        elif word_name == '<UNK>':
          self._unk = idx
        if word_name == '!!!MAXTERMID':
          continue

        self._id_to_word.append(word_name)
        self._word_to_id[word_name] = idx
        idx += 1

  @property
  def bos(self):
    return self._bos

  @property
  def eos(self):
    return self._eos

  @property
  def unk(self):
    return self._unk

  @property
  def size(self):
    return len(self._id_to_word)

  def word_to_id(self, word):
    if word in self._word_to_id:
      return self._word_to_id[word]
    else:
      if word.lower() in self._word_to_id:
        return self._word_to_id[word.lower()]
    return self.unk

  def id_to_word(self, cur_id):
    if cur_id < self.size:
      return self._id_to_word[int(cur_id)]
    return '<ERROR_out_of_vocab_id>'

  def decode(self, cur_ids):
    return ' '.join([self.id_to_word(cur_id) for cur_id in cur_ids])

  def encode(self, sentence):
    word_ids = [self.word_to_id(cur_word) for cur_word in sentence.split()]
    return np.array([self.bos] + word_ids + [self.eos], dtype=np.int32)
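

# Example usage (a minimal sketch; 'vocab.txt' is a hypothetical vocabulary
# file with one token per line, including the <S>, </S> and <UNK> entries
# looked for above):
#
#   vocab = Vocabulary('vocab.txt')
#   ids = vocab.encode('He drank the water')  # np.int32 ids with bos/eos added
#   vocab.decode(ids)  # -> '<S> He drank the water </S>' if all words are known
#   vocab.word_to_id('zzznotaword')  # -> vocab.unk for out-of-vocabulary words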
class CharsVocabulary(Vocabulary):
  """Vocabulary containing character-level information."""

  def __init__(self, filename, max_word_length):
    super(CharsVocabulary, self).__init__(filename)
    self._max_word_length = max_word_length
    chars_set = set()

    for word in self._id_to_word:
      chars_set |= set(word)

    free_ids = []
    for i in range(256):
      if chr(i) in chars_set:
        continue
      free_ids.append(chr(i))

    if len(free_ids) < 5:
      raise ValueError('Not enough free char ids: %d' % len(free_ids))

    self.bos_char = free_ids[0]  # <begin sentence>
    self.eos_char = free_ids[1]  # <end sentence>
    self.bow_char = free_ids[2]  # <begin word>
    self.eow_char = free_ids[3]  # <end word>
    self.pad_char = free_ids[4]  # <padding>

    chars_set |= {self.bos_char, self.eos_char, self.bow_char, self.eow_char,
                  self.pad_char}

    self._char_set = chars_set
    num_words = len(self._id_to_word)

    self._word_char_ids = np.zeros([num_words, max_word_length], dtype=np.int32)

    self.bos_chars = self._convert_word_to_char_ids(self.bos_char)
    self.eos_chars = self._convert_word_to_char_ids(self.eos_char)

    for i, word in enumerate(self._id_to_word):
      if i == self.bos:
        self._word_char_ids[i] = self.bos_chars
      elif i == self.eos:
        self._word_char_ids[i] = self.eos_chars
      else:
        self._word_char_ids[i] = self._convert_word_to_char_ids(word)

  @property
  def max_word_length(self):
    return self._max_word_length

  def _convert_word_to_char_ids(self, word):
    code = np.zeros([self.max_word_length], dtype=np.int32)
    code[:] = ord(self.pad_char)

    if len(word) > self.max_word_length - 2:
      word = word[:self.max_word_length - 2]
    cur_word = self.bow_char + word + self.eow_char
    for j in range(len(cur_word)):
      code[j] = ord(cur_word[j])
    return code

  def word_to_char_ids(self, word):
    if word in self._word_to_id:
      return self._word_char_ids[self._word_to_id[word]]
    else:
      return self._convert_word_to_char_ids(word)

  def encode_chars(self, sentence):
    chars_ids = [self.word_to_char_ids(cur_word)
                 for cur_word in sentence.split()]
    return np.vstack([self.bos_chars] + chars_ids + [self.eos_chars])
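

# Example usage (a minimal sketch with the same hypothetical 'vocab.txt' as
# above; any max_word_length long enough for the longest word plus the
# begin/end-of-word markers works, since longer words are truncated):
#
#   chars_vocab = CharsVocabulary('vocab.txt', max_word_length=50)
#   char_ids = chars_vocab.encode_chars('He drank the water')
#   # char_ids.shape == (6, 50): one row per token plus <S> and </S> rows,
#   # each row filled with character codes and padded with the pad char id.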
_SPECIAL_CHAR_MAP = {
    '\xe2\x80\x98': '\'',
    '\xe2\x80\x99': '\'',
    '\xe2\x80\x9c': '"',
    '\xe2\x80\x9d': '"',
    '\xe2\x80\x93': '-',
    '\xe2\x80\x94': '-',
    '\xe2\x88\x92': '-',
    '\xce\x84': '\'',
    '\xc2\xb4': '\'',
    '`': '\''
}

_START_SPECIAL_CHARS = ['.', ',', '?', '!', ';', ':', '[', ']', '\'', '+', '/',
                        '\xc2\xa3', '$', '~', '*', '%', '{', '}', '#', '&', '-',
                        '"', '(', ')', '='] + list(_SPECIAL_CHAR_MAP.keys())

_SPECIAL_CHARS = _START_SPECIAL_CHARS + [
    '\'s', '\'m', '\'t', '\'re', '\'d', '\'ve', '\'ll']
def tokenize(sentence):
  """Tokenize a sentence."""
  sentence = str(sentence)
  words = sentence.strip().split()
  tokenized = []  # return this

  for word in words:
    if word.lower() in ['mr.', 'ms.']:
      tokenized.append(word)
      continue

    # Split special chars at the start of word
    will_split = True
    while will_split:
      will_split = False
      for char in _START_SPECIAL_CHARS:
        if word.startswith(char):
          tokenized.append(char)
          word = word[len(char):]
          will_split = True

    # Split special chars at the end of word
    special_end_tokens = []
    will_split = True
    while will_split:
      will_split = False
      for char in _SPECIAL_CHARS:
        if word.endswith(char):
          special_end_tokens = [char] + special_end_tokens
          word = word[:-len(char)]
          will_split = True

    if word:
      tokenized.append(word)
    tokenized += special_end_tokens

  # Add necessary end of sentence token.
  if tokenized[-1] not in ['.', '!', '?']:
    tokenized += ['.']
  return tokenized
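

# Example (a minimal sketch): punctuation and clitics are split into their own
# tokens, and a final '.' is appended only when the sentence does not already
# end in '.', '!' or '?':
#
#   tokenize("Mr. Smith's dog (the brown one) barked!")
#   # -> ['Mr.', 'Smith', "'s", 'dog', '(', 'the', 'brown', 'one', ')',
#   #     'barked', '!']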
def parse_commonsense_reasoning_test(test_data_name):
  """Read JSON test data."""
  with tf.gfile.Open(os.path.join(
      FLAGS.data_dir, 'commonsense_test',
      '{}.json'.format(test_data_name)), 'r') as f:
    data = json.load(f)

  question_ids = [d['question_id'] for d in data]
  sentences = [tokenize(d['substitution']) for d in data]
  labels = [d['correctness'] for d in data]
  return question_ids, sentences, labels
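

# Each record in '<test_data_name>.json' must provide at least the three
# fields read above. A minimal sketch of one entry (values are made up):
#
#   {"question_id": 0,
#    "substitution": "The trophy does not fit because the trophy is too big.",
#    "correctness": true}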
PAD = '<padding>'


def cut_to_patches(sentences, batch_size, num_timesteps):
  """Cut sentences into patches of shape (batch_size, num_timesteps).

  Args:
    sentences: a list of sentences, each sentence is a list of str tokens.
    batch_size: batch size.
    num_timesteps: number of backprop steps.

  Returns:
    patches: A 2D matrix,
      each entry is a matrix of shape (batch_size, num_timesteps).
  """
  preprocessed = [['<S>'] + sentence + ['</S>'] for sentence in sentences]
  max_len = max([len(sent) for sent in preprocessed])

  # Pad to shape [height, width]
  # where height is a multiple of batch_size
  # and width is a multiple of num_timesteps
  nrow = int(np.ceil(len(preprocessed) * 1.0 / batch_size))
  ncol = int(np.ceil(max_len * 1.0 / num_timesteps))
  height, width = nrow * batch_size, ncol * num_timesteps + 1
  preprocessed = [sent + [PAD] * (width - len(sent)) for sent in preprocessed]
  preprocessed += [[PAD] * width] * (height - len(preprocessed))

  # Cut preprocessed into patches of shape [batch_size, num_timesteps]
  patches = []
  for row in range(nrow):
    patches.append([])
    for col in range(ncol):
      patch = [sent[col * num_timesteps:
                    (col + 1) * num_timesteps + 1]
               for sent in preprocessed[row * batch_size:
                                        (row + 1) * batch_size]]
      if np.all(np.array(patch)[:, 1:] == PAD):
        patch = None  # no need to process this patch.
      patches[-1].append(patch)
  return patches
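

# Example (a minimal sketch; numbers chosen only for illustration):
#
#   sents = [tokenize('He drank the water'), tokenize('It was cold')]
#   patches = cut_to_patches(sents, batch_size=2, num_timesteps=4)
#   # patches is an nrow x ncol grid; here nrow == 1 and ncol == 2, and each
#   # non-None entry holds batch_size token rows of length num_timesteps + 1
#   # (the one-token overlap is what allows next-step targets to be formed).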
def _substitution_mask(sent1, sent2):
  """Binary mask identifying the substituted part in two sentences.

  Example sentences and their masks:
    First sentence  = "I like the cat 's color"
                       0 0    0   1   0  0
    Second sentence = "I like the yellow dog 's color"
                       0 0    0   1      1   0  0

  Args:
    sent1: first sentence
    sent2: second sentence

  Returns:
    mask1: mask for first sentence
    mask2: mask for second sentence
  """
  mask1_start, mask2_start = [], []
  while sent1[0] == sent2[0]:
    sent1 = sent1[1:]
    sent2 = sent2[1:]
    mask1_start.append(0.)
    mask2_start.append(0.)

  mask1_end, mask2_end = [], []
  while sent1[-1] == sent2[-1]:
    if (len(sent1) == 1) or (len(sent2) == 1):
      break
    sent1 = sent1[:-1]
    sent2 = sent2[:-1]
    mask1_end = [0.] + mask1_end
    mask2_end = [0.] + mask2_end

  assert sent1 or sent2, 'Two sentences are identical.'
  return (mask1_start + [1.] * len(sent1) + mask1_end,
          mask2_start + [1.] * len(sent2) + mask2_end)
def _convert_to_partial(scoring1, scoring2):
  """Convert full scoring into partial scoring."""
  mask1, mask2 = _substitution_mask(
      scoring1['sentence'], scoring2['sentence'])

  def _partial_score(scoring, mask):
    # max(prob, mask) replaces the probabilities of substituted words
    # (mask == 1.) with 1.0, so the joint probability only covers the
    # context words shared by both sentences.
    word_probs = [max(_) for _ in zip(scoring['word_probs'], mask)]
    scoring.update(word_probs=word_probs,
                   joint_prob=np.prod(word_probs))

  _partial_score(scoring1, mask1)
  _partial_score(scoring2, mask2)
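

# Example (a minimal sketch; probabilities are made up):
#
#   scoring = {'sentence': ['I', 'like', 'the', 'cat', "'s", 'color'],
#              'word_probs': [.9, .8, .9, .2, .7, .6]}
#   other = {'sentence': ['I', 'like', 'the', 'yellow', 'dog', "'s", 'color'],
#            'word_probs': [.9, .8, .9, .1, .3, .7, .5]}
#   _convert_to_partial(scoring, other)
#   # scoring['word_probs'] -> [.9, .8, .9, 1.0, .7, .6]; only the words
#   # shared with the other sentence contribute to scoring['joint_prob'].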
def compare_substitutions(question_ids, scorings, mode='full'):
  """Return accuracy by comparing two consecutive scorings."""
  prediction_correctness = []
  # Compare two consecutive substitutions
  for i in range(len(scorings) // 2):
    scoring1, scoring2 = scorings[2*i: 2*i+2]
    if mode == 'partial':  # fix joint prob into partial prob
      _convert_to_partial(scoring1, scoring2)

    prediction_correctness.append(
        (scoring2['joint_prob'] > scoring1['joint_prob']) ==
        scoring2['correctness'])

  # Two consecutive substitutions always belong to the same question.
  question_ids = [qid for i, qid in enumerate(question_ids) if i % 2 == 0]
  assert len(question_ids) == len(prediction_correctness)
  num_questions = len(set(question_ids))

  # A question is correctly answered only if
  # all predictions for the same question_id are correct.
  num_correct_answer = 0
  previous_qid = None
  correctly_answered = False
  for predict, qid in zip(prediction_correctness, question_ids):
    if qid != previous_qid:
      previous_qid = qid
      num_correct_answer += int(correctly_answered)
      correctly_answered = True
    correctly_answered = correctly_answered and predict
  num_correct_answer += int(correctly_answered)

  return num_correct_answer / num_questions
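

# Example (a minimal sketch; each pair of consecutive scorings is assumed to
# hold the two substitutions of one question, probabilities are made up):
#
#   qids = [17, 17]
#   scorings = [
#       {'joint_prob': 1e-12, 'correctness': False},  # incorrect substitution
#       {'joint_prob': 3e-12, 'correctness': True},   # correct substitution
#   ]
#   compare_substitutions(qids, scorings, mode='full')  # -> 1.0
#
# With mode='partial', each scoring dict also needs its 'sentence' and
# 'word_probs' entries so that _convert_to_partial can rescore using only the
# shared context words.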