# Copyright 2017, 2018 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""The integrated LexNET model.""" | |
from __future__ import absolute_import | |
from __future__ import division | |
from __future__ import print_function | |
import lexnet_common | |
import numpy as np | |
import tensorflow as tf | |
from six.moves import xrange | |


class LexNETModel(object):
  """The LexNET model for classifying relationships between noun compounds."""

  @classmethod
  def default_hparams(cls):
    """Returns the default hyper-parameters."""
    return tf.contrib.training.HParams(
        batch_size=10,
        num_classes=37,
        num_epochs=30,
        input_keep_prob=0.9,
        input='integrated',  # dist / dist-nc / path / integrated / integrated-nc
        learn_relata=False,
        corpus='wiki_gigawords',
        random_seed=133,  # zero means no random seed
        relata_embeddings_file='glove/glove.6B.300d.bin',
        nc_embeddings_file='nc_glove/vecs.6B.300d.bin',
        path_embeddings_file='path_embeddings/tratz/fine_grained/wiki',
        hidden_layers=1,
        path_dim=60)
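
  # Individual hyper-parameters can be overridden at construction time, e.g.
  # (illustrative): LexNETModel.default_hparams().parse('input=path').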

  def __init__(self, hparams, relata_embeddings, path_embeddings,
               nc_embeddings, path_to_index):
    """Initialize the LexNET classifier.

    Args:
      hparams: the hyper-parameters.
      relata_embeddings: word embeddings for the distributional component.
      path_embeddings: embeddings for the paths.
      nc_embeddings: noun compound embeddings.
      path_to_index: a mapping from string path to an index in the path
        embeddings matrix.
    """
    self.hparams = hparams

    self.path_embeddings = path_embeddings
    self.relata_embeddings = relata_embeddings
    self.nc_embeddings = nc_embeddings

    self.vocab_size, self.relata_dim = 0, 0
    self.path_to_index = None
    self.path_dim = 0

    # Set the random seed.
    if hparams.random_seed > 0:
      tf.set_random_seed(hparams.random_seed)

    # Get the vocabulary size and relata dimension.
    if self.hparams.input in ['dist', 'dist-nc', 'integrated', 'integrated-nc']:
      self.vocab_size, self.relata_dim = self.relata_embeddings.shape

    # Create the mapping from string path to an index in the embeddings matrix.
    if self.hparams.input in ['path', 'integrated', 'integrated-nc']:
      self.path_to_index = tf.contrib.lookup.HashTable(
          tf.contrib.lookup.KeyValueTensorInitializer(
              tf.constant(list(path_to_index.keys())),
              tf.constant(list(path_to_index.values())),
              key_dtype=tf.string, value_dtype=tf.int32), 0)
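      # Paths missing from the vocabulary map to the default value 0, i.e. to
      # row 0 of the path embedding matrix.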
      self.path_dim = self.path_embeddings.shape[1]

    # Create the network.
    self.__create_computation_graph__()

  def __create_computation_graph__(self):
    """Initialize the model and define the graph."""
    network_input = 0

    # Define the network inputs.
    # Distributional x and y:
    if self.hparams.input in ['dist', 'dist-nc', 'integrated', 'integrated-nc']:
      network_input += 2 * self.relata_dim
      self.relata_lookup = tf.get_variable(
          'relata_lookup',
          initializer=self.relata_embeddings,
          dtype=tf.float32,
          trainable=self.hparams.learn_relata)

    # Path-based:
    if self.hparams.input in ['path', 'integrated', 'integrated-nc']:
      network_input += self.path_dim
      self.path_initial_value_t = tf.placeholder(tf.float32, None)
      self.path_lookup = tf.get_variable(
          name='path_lookup',
          dtype=tf.float32,
          trainable=False,
          shape=self.path_embeddings.shape)
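      # The path embeddings are loaded through a placeholder and an assign op
      # (rather than a graph-level initializer), presumably so the potentially
      # large matrix is not serialized into the graph definition.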
      self.initialize_path_op = tf.assign(
          self.path_lookup, self.path_initial_value_t, validate_shape=False)

    # Distributional noun compound:
    if self.hparams.input in ['dist-nc', 'integrated-nc']:
      network_input += self.relata_dim
      self.nc_initial_value_t = tf.placeholder(tf.float32, None)
      self.nc_lookup = tf.get_variable(
          name='nc_lookup',
          dtype=tf.float32,
          trainable=False,
          shape=self.nc_embeddings.shape)
      self.initialize_nc_op = tf.assign(
          self.nc_lookup, self.nc_initial_value_t, validate_shape=False)
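
    # The single hidden layer is half as wide as the concatenated input.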
    hidden_dim = network_input // 2

    # Define the MLP.
    if self.hparams.hidden_layers == 0:
      self.weights1 = tf.get_variable(
          'W1',
          shape=[network_input, self.hparams.num_classes],
          dtype=tf.float32)
      self.bias1 = tf.get_variable(
          'b1',
          shape=[self.hparams.num_classes],
          dtype=tf.float32)
    elif self.hparams.hidden_layers == 1:
      self.weights1 = tf.get_variable(
          'W1',
          shape=[network_input, hidden_dim],
          dtype=tf.float32)
      self.bias1 = tf.get_variable(
          'b1',
          shape=[hidden_dim],
          dtype=tf.float32)
      self.weights2 = tf.get_variable(
          'W2',
          shape=[hidden_dim, self.hparams.num_classes],
          dtype=tf.float32)
      self.bias2 = tf.get_variable(
          'b2',
          shape=[self.hparams.num_classes],
          dtype=tf.float32)
    else:
      raise ValueError('Only 0 or 1 hidden layers are supported.')

    # Define the variables.
    self.instances = tf.placeholder(dtype=tf.string,
                                    shape=[self.hparams.batch_size])

    (self.x_embedding_id,
     self.y_embedding_id,
     self.nc_embedding_id,
     self.path_embedding_id,
     self.path_counts,
     self.labels) = parse_tensorflow_examples(
         self.instances, self.hparams.batch_size, self.path_to_index)

    # Create the MLP.
    self.__mlp__()

    self.instances_to_load = tf.placeholder(dtype=tf.string, shape=[None])
    self.labels_to_load = lexnet_common.load_all_labels(self.instances_to_load)
    self.pairs_to_load = lexnet_common.load_all_pairs(self.instances_to_load)
  def load_labels(self, session, instances):
    """Loads the labels for these instances.

    Args:
      session: The current TensorFlow session.
      instances: The instances for which to load the labels.

    Returns:
      the labels of these instances.
    """
    return session.run(self.labels_to_load,
                       feed_dict={self.instances_to_load: instances})

  def load_pairs(self, session, instances):
    """Loads the word pairs for these instances.

    Args:
      session: The current TensorFlow session.
      instances: The instances for which to load the word pairs.

    Returns:
      the word pairs of these instances.
    """
    word_pairs = session.run(self.pairs_to_load,
                             feed_dict={self.instances_to_load: instances})
    # Pairs are serialized as 'x::y' strings.
    return [pair[0].split('::') for pair in word_pairs]
  def __train_single_batch__(self, session, batch_instances):
    """Train a single batch.

    Args:
      session: The current TensorFlow session.
      batch_instances: TensorFlow examples containing the training instances.

    Returns:
      The cost for the current batch.
    """
    cost, _ = session.run([self.cost, self.train_op],
                          feed_dict={self.instances: batch_instances})
    return cost
  def fit(self, session, inputs, on_epoch_completed, val_instances, val_labels,
          save_path):
    """Train the model.

    Args:
      session: The current TensorFlow session.
      inputs: The training instances (serialized tf.train.Examples).
      on_epoch_completed: A method to call after each epoch.
      val_instances: The validation set instances (for evaluation between
        epochs).
      val_labels: The validation set labels (for evaluation between epochs).
      save_path: Where to save the model.
    """
    for epoch in range(self.hparams.num_epochs):
      losses = []
      epoch_indices = list(np.random.permutation(len(inputs)))

      # If the number of instances doesn't divide evenly by batch_size, pad
      # the last batch by duplicating randomly chosen training examples.
      mod = len(epoch_indices) % self.hparams.batch_size
      if mod > 0:
        pad_size = self.hparams.batch_size - mod
        epoch_indices.extend(
            np.random.randint(0, high=len(inputs), size=pad_size))

      # Define the batches.
      n_batches = len(epoch_indices) // self.hparams.batch_size

      for minibatch in range(n_batches):
        batch_indices = epoch_indices[minibatch * self.hparams.batch_size:(
            minibatch + 1) * self.hparams.batch_size]
        batch_instances = [inputs[i] for i in batch_indices]
        loss = self.__train_single_batch__(session, batch_instances)
        losses.append(loss)

      epoch_loss = np.nanmean(losses)

      if on_epoch_completed:
        should_stop = on_epoch_completed(self, session, epoch, epoch_loss,
                                         val_instances, val_labels, save_path)
        if should_stop:
          print('Stopping training after %d epochs.' % epoch)
          return
  def predict(self, session, inputs):
    """Predict the classification of the test set.

    Args:
      session: The current TensorFlow session.
      inputs: the test paths, x, y and/or nc vectors.

    Returns:
      The test predictions.
    """
    predictions, _ = zip(*self.predict_with_score(session, inputs))
    return np.array(predictions)
  def predict_with_score(self, session, inputs):
    """Predict the classification of the test set.

    Args:
      session: The current TensorFlow session.
      inputs: the test paths, x, y and/or nc vectors.

    Returns:
      The test predictions along with their scores.
    """
    test_pred = [0] * len(inputs)

    for chunk in xrange(0, len(test_pred), self.hparams.batch_size):
      # Initialize the variables with the current batch data.
      batch_indices = list(
          range(chunk, min(chunk + self.hparams.batch_size, len(test_pred))))

      # If the batch is too small, pad it with copies of the first example;
      # the padded slots just rewrite index 0 with its own prediction.
      if len(batch_indices) < self.hparams.batch_size:
        batch_indices += [0] * (self.hparams.batch_size - len(batch_indices))
      batch_instances = [inputs[i] for i in batch_indices]

      predictions, scores = session.run(
          [self.predictions, self.scores],
          feed_dict={self.instances: batch_instances})

      for index_in_batch, index_in_dataset in enumerate(batch_indices):
        prediction = predictions[index_in_batch]
        score = scores[index_in_batch][prediction]
        test_pred[index_in_dataset] = (prediction, score)

    return test_pred
  def __mlp__(self):
    """Performs the MLP operations, defining the prediction and training ops."""
    # Define the operations.

    # Network input.
    vec_inputs = []

    # Distributional component.
    if self.hparams.input in ['dist', 'dist-nc', 'integrated', 'integrated-nc']:
      for emb_id in [self.x_embedding_id, self.y_embedding_id]:
        vec_inputs.append(tf.nn.embedding_lookup(self.relata_lookup, emb_id))

    # Noun compound component.
    if self.hparams.input in ['dist-nc', 'integrated-nc']:
      vec = tf.nn.embedding_lookup(self.nc_lookup, self.nc_embedding_id)
      vec_inputs.append(vec)

    # Path-based component.
    if self.hparams.input in ['path', 'integrated', 'integrated-nc']:
      # Get the current paths for each batch instance.
      self.path_embeddings = tf.nn.embedding_lookup(self.path_lookup,
                                                    self.path_embedding_id)

      # self.path_embeddings has shape
      # [batch_size, max_path_per_instance, output_dim].
      # We need to weight it by the path counts
      # ([batch_size, max_path_per_instance]), so we first duplicate
      # path_counts along the output_dim axis.
      self.path_freq = tf.tile(tf.expand_dims(self.path_counts, -1),
                               [1, 1, self.path_dim])

      # Compute the averaged path vector for each instance:
      # first, multiply the path embeddings and frequencies element-wise,
      self.weighted = tf.multiply(self.path_freq, self.path_embeddings)

      # then sum to get a tensor of shape [batch_size, output_dim],
      self.pair_path_embeddings = tf.reduce_sum(self.weighted, 1)

      # and finally divide by the total number of paths. The path count for
      # each pair has shape [batch_size, 1]; it is clipped at 1 to avoid
      # division by zero, and duplicated output_dim times along the second
      # axis.
      self.num_paths = tf.clip_by_value(
          tf.reduce_sum(self.path_counts, 1), 1, np.inf)
      self.num_paths = tf.tile(tf.expand_dims(self.num_paths, -1),
                               [1, self.path_dim])

      self.pair_path_embeddings = tf.div(
          self.pair_path_embeddings, self.num_paths)
      vec_inputs.append(self.pair_path_embeddings)

    # Concatenate the inputs, apply dropout, and feed them to the MLP.
    self.input_vec = tf.nn.dropout(
        tf.concat(vec_inputs, 1),
        keep_prob=self.hparams.input_keep_prob)

    h = tf.matmul(self.input_vec, self.weights1) + self.bias1
    self.output = h

    if self.hparams.hidden_layers == 1:
      self.output = tf.matmul(tf.nn.tanh(h), self.weights2) + self.bias2

    self.scores = self.output
    self.predictions = tf.argmax(self.scores, axis=1)

    # Define the loss function and the optimization algorithm.
    self.cross_entropies = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=self.scores, labels=self.labels)
    self.cost = tf.reduce_sum(self.cross_entropies, name='cost')
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    self.optimizer = tf.train.AdamOptimizer()
    self.train_op = self.optimizer.minimize(
        self.cost, global_step=self.global_step)


def parse_tensorflow_examples(record, batch_size, path_to_index):
  """Reads TensorFlow examples from a RecordReader.

  Args:
    record: a record with TensorFlow examples.
    batch_size: the number of instances in a minibatch.
    path_to_index: mapping from string path to index in the embeddings matrix.

  Returns:
    The word embedding IDs, path embedding IDs, path counts and labels.
  """
  features = tf.parse_example(
      record, {
          'x_embedding_id': tf.FixedLenFeature([1], dtype=tf.int64),
          'y_embedding_id': tf.FixedLenFeature([1], dtype=tf.int64),
          'nc_embedding_id': tf.FixedLenFeature([1], dtype=tf.int64),
          'reprs': tf.FixedLenSequenceFeature(
              shape=(), dtype=tf.string, allow_missing=True),
          'counts': tf.FixedLenSequenceFeature(
              shape=(), dtype=tf.int64, allow_missing=True),
          'rel_id': tf.FixedLenFeature([1], dtype=tf.int64)
      })
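  # Each scalar feature is stored as a length-1 vector; squeeze the trailing
  # dimension to get tensors of shape [batch_size].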
  x_embedding_id = tf.squeeze(features['x_embedding_id'], [-1])
  y_embedding_id = tf.squeeze(features['y_embedding_id'], [-1])
  nc_embedding_id = tf.squeeze(features['nc_embedding_id'], [-1])
  labels = tf.squeeze(features['rel_id'], [-1])
  path_counts = tf.to_float(tf.reshape(features['counts'], [batch_size, -1]))

  path_embedding_id = None
  if path_to_index:
    path_embedding_id = path_to_index.lookup(features['reprs'])

  return (
      x_embedding_id, y_embedding_id, nc_embedding_id,
      path_embedding_id, path_counts, labels)
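

if __name__ == '__main__':
  # Minimal smoke test: builds the graph with small random embeddings. This is
  # an illustrative sketch, not part of the original training pipeline; the
  # embedding shapes and the dummy path vocabulary below are assumptions.
  _hparams = LexNETModel.default_hparams()
  _relata = np.random.rand(1000, 300).astype(np.float32)
  _paths = np.random.rand(50, _hparams.path_dim).astype(np.float32)
  _ncs = np.random.rand(1000, 300).astype(np.float32)
  _model = LexNETModel(_hparams, _relata, _paths, _ncs,
                       {'X/NOUN/<>/Y/NOUN': 0})
  print('Graph built for input mode: %s' % _hparams.input)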