# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import pickle as pkl

import numpy as np
import tensorflow as tf

import utils

# Root directory holding vocab.txt, the test data, and one checkpoint
# sub-directory per language model (lm01, lm02, ...).
tf.app.flags.DEFINE_string(
    'data_dir', 'reproduce',
    'Path to directory containing data and model checkpoints.')
FLAGS = tf.app.flags.FLAGS
class EnsembleLM(object):
  """Ensemble of language models.

  Each member model contributes a matrix of per-word probabilities for the
  test sentences; `evaluate` averages those matrices and scores every
  candidate substitution against the gold labels.
  """

  def __init__(self, test_data_name='wsc273'):
    """Load the shared vocabulary and the requested test set.

    Args:
      test_data_name: either 'pdp60' or 'wsc273'.
    """
    vocab_path = os.path.join(FLAGS.data_dir, 'vocab.txt')
    self.vocab = utils.CharsVocabulary(vocab_path, 50)
    assert test_data_name in ['pdp60', 'wsc273'], (
        'Test data must be pdp60 or wsc273, got {}'.format(test_data_name))
    self.test_data_name = test_data_name
    (self.question_ids,
     self.sentences,
     self.labels) = utils.parse_commonsense_reasoning_test(test_data_name)
    # Per-model probability matrices accumulate here until evaluate().
    self.all_probs = []

  def add_single_model(self, model_name='lm1'):
    """Add a single model into the current ensemble."""
    single_lm = SingleRecurrentLanguageModel(self.vocab, model_name)
    self.all_probs.append(
        single_lm.assign_probs(self.sentences, self.test_data_name))
    print('Done adding {}'.format(model_name))

  def evaluate(self):
    """Evaluate the current ensemble and return its accuracy."""
    # Average the per-word probabilities across all member models.
    mean_probs = sum(self.all_probs) / len(self.all_probs)
    scorings = []
    for idx, sentence in enumerate(self.sentences):
      # Trim padding: only the sentence's own words carry probability mass.
      word_probs = mean_probs[idx, :len(sentence)]
      scorings.append(dict(
          correctness=self.labels[idx],
          sentence=sentence,
          joint_prob=np.prod(word_probs, dtype=np.float64),
          word_probs=word_probs))
    # pdp60 scores full sentences; wsc273 scores the partial span.
    scoring_mode = 'full' if self.test_data_name == 'pdp60' else 'partial'
    return utils.compare_substitutions(
        self.question_ids, scorings, scoring_mode)
class SingleRecurrentLanguageModel(object):
  """Single Recurrent Language Model.

  Restores a pre-trained checkpoint from `FLAGS.data_dir/<model_name>` and
  scores test sentences word-by-word, caching the resulting probabilities
  on disk so repeated evaluations are cheap.
  """

  def __init__(self, vocab, model_name='lm01'):
    """Args:
      vocab: utils.CharsVocabulary mapping words to word ids and char ids.
      model_name: checkpoint sub-directory name under FLAGS.data_dir.
    """
    self.vocab = vocab
    self.log_dir = os.path.join(FLAGS.data_dir, model_name)

  def reset(self):
    """Reset the recurrent states of the LM."""
    self.sess.run(self.tensors['states_init'])

  def _score(self, word_patch):
    """Score a matrix of shape (batch_size, num_timesteps+1) str tokens.

    Returns:
      np.ndarray of shape (batch_size, num_timesteps) with the probability
      the LM assigns to each target word given its preceding context.
    """
    word_ids = np.array(
        [[self.vocab.word_to_id(word) for word in row]
         for row in word_patch])
    char_ids = np.array(
        [[self.vocab.word_to_char_ids(word) for word in row]
         for row in word_patch])
    print('Probs for \n{}\n='.format(np.array(word_patch)[:, 1:]))
    # Next-word prediction: inputs drop the last column, targets the first.
    input_ids, target_ids = word_ids[:, :-1], word_ids[:, 1:]
    input_char_ids = char_ids[:, :-1, :]
    softmax = self.sess.run(self.tensors['softmax_out'], feed_dict={
        self.tensors['inputs_in']: input_ids,
        self.tensors['char_inputs_in']: input_char_ids
    })
    batch_size, num_timesteps = self.shape
    # The graph emits a flat softmax in time-major order; reshape and
    # transpose to batch-major (batch, time, vocab).
    softmax = softmax.reshape((num_timesteps, batch_size, -1))
    softmax = np.transpose(softmax, [1, 0, 2])
    # Gather the probability assigned to each target token.
    probs = np.array([[softmax[row, col, target_ids[row, col]]
                       for col in range(num_timesteps)]
                      for row in range(batch_size)])
    print(probs)
    return probs

  def _score_patches(self, word_patches):
    """Score a 2D matrix of word_patches and stitch results together.

    Rows are independent batches of sentences; columns are consecutive
    time slices of the same sentences, so RNN state carries across columns
    and is reset between rows.
    """
    batch_size, num_timesteps = self.shape
    nrow, ncol = len(word_patches), len(word_patches[0])
    max_len = num_timesteps * ncol
    probs = np.zeros([0, max_len])  # accumulate results into this.
    # Loop through the 2D matrix of word_patches and score each.
    for i, row in enumerate(word_patches):
      print('Reset RNN states.')
      self.reset()  # reset states before processing each row.
      row_probs = np.zeros([batch_size, 0])
      for j, word_patch in enumerate(row):
        print('Processing patch '
              '({}, {}) / ({}, {})'.format(i+1, j+1, nrow, ncol))
        # Empty trailing patches contribute zero probability padding.
        patch_probs = (self._score(word_patch) if word_patch else
                       np.zeros([batch_size, num_timesteps]))
        row_probs = np.concatenate([row_probs, patch_probs], 1)
      probs = np.concatenate([probs, row_probs], 0)
    return probs

  def assign_probs(self, sentences, test_data_name='wsc273'):
    """Return per-word probabilities of `sentences` under this LM.

    Results are cached at <log_dir>/<test_data_name>.probs and reused when
    present, since evaluating the LM is expensive.
    """
    probs_cache = os.path.join(self.log_dir, '{}.probs'.format(test_data_name))
    if os.path.exists(probs_cache):
      print('Reading cached result from {}'.format(probs_cache))
      # Pickle is a binary format: open in binary mode ('rb'), not text
      # mode, which breaks under Python 3 and can corrupt the stream.
      with tf.gfile.Open(probs_cache, 'rb') as f:
        probs = pkl.load(f)
    else:
      tf.reset_default_graph()
      self.sess = tf.Session()
      # Build the graph.
      saver = tf.train.import_meta_graph(
          os.path.join(self.log_dir, 'ckpt-best.meta'))
      saver.restore(self.sess, os.path.join(self.log_dir, 'ckpt-best'))
      print('Restored from {}'.format(self.log_dir))
      graph = tf.get_default_graph()
      self.tensors = dict(
          inputs_in=graph.get_tensor_by_name('test_inputs_in:0'),
          char_inputs_in=graph.get_tensor_by_name('test_char_inputs_in:0'),
          softmax_out=graph.get_tensor_by_name('SotaRNN_1/softmax_out:0'),
          states_init=graph.get_operation_by_name('SotaRNN_1/states_init'))
      self.shape = self.tensors['inputs_in'].shape.as_list()
      # Cut sentences into patches of shape processable by the LM.
      batch_size, num_timesteps = self.shape
      word_patches = utils.cut_to_patches(sentences, batch_size, num_timesteps)
      probs = self._score_patches(word_patches)
      # Cache the probs since they are expensive to evaluate
      with tf.gfile.Open(probs_cache, 'wb') as f:
        pkl.dump(probs, f)
    return probs
def evaluate_ensemble(test_data_name, number_of_lms):
  """Build an ensemble of `number_of_lms` models and print its accuracy.

  Models are named lm01, lm02, ... and loaded from FLAGS.data_dir.
  """
  ensemble = EnsembleLM(test_data_name)
  for lm_index in range(number_of_lms):
    ensemble.add_single_model('lm{:02d}'.format(lm_index + 1))
  accuracy = ensemble.evaluate()
  print('Accuracy of {} LM(s) on {} = {}'.format(
      number_of_lms, test_data_name, accuracy))
def main(_):
  """Reproduce the reported accuracies for each test set / ensemble size."""
  experiments = [
      ('pdp60', 1),    # 60%
      ('pdp60', 5),    # 70%
      ('wsc273', 10),  # 61.5%
      ('wsc273', 14),  # 63.7%
  ]
  for test_data_name, number_of_lms in experiments:
    evaluate_ensemble(test_data_name, number_of_lms)


if __name__ == '__main__':
  tf.app.run(main)