# Copyright 2017, 2018 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""The integrated LexNET model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import lexnet_common
import numpy as np
import tensorflow as tf
from six.moves import xrange
class LexNETModel(object):
"""The LexNET model for classifying relationships between noun compounds."""
@classmethod
def default_hparams(cls):
"""Returns the default hyper-parameters."""
return tf.contrib.training.HParams(
batch_size=10,
num_classes=37,
num_epochs=30,
input_keep_prob=0.9,
        # One of: dist, dist-nc, path, integrated, integrated-nc.
        input='integrated',
learn_relata=False,
corpus='wiki_gigawords',
random_seed=133, # zero means no random seed
relata_embeddings_file='glove/glove.6B.300d.bin',
nc_embeddings_file='nc_glove/vecs.6B.300d.bin',
path_embeddings_file='path_embeddings/tratz/fine_grained/wiki',
hidden_layers=1,
path_dim=60)
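  # A minimal construction sketch (hypothetical shapes and path string; the
  # real embeddings are loaded from the files named in the hyper-parameters):
  #
  #   hparams = LexNETModel.default_hparams()
  #   relata = np.random.rand(1000, 300).astype(np.float32)
  #   paths = np.random.rand(500, 60).astype(np.float32)
  #   ncs = np.random.rand(1000, 300).astype(np.float32)
  #   path_to_index = {'X/NOUN/compound>_Y/NOUN/root': 7}
  #   model = LexNETModel(hparams, relata, paths, ncs, path_to_index)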
def __init__(self, hparams, relata_embeddings, path_embeddings, nc_embeddings,
path_to_index):
"""Initialize the LexNET classifier.
Args:
hparams: the hyper-parameters.
relata_embeddings: word embeddings for the distributional component.
path_embeddings: embeddings for the paths.
nc_embeddings: noun compound embeddings.
path_to_index: a mapping from string path to an index in the path
embeddings matrix.
"""
self.hparams = hparams
self.path_embeddings = path_embeddings
self.relata_embeddings = relata_embeddings
self.nc_embeddings = nc_embeddings
self.vocab_size, self.relata_dim = 0, 0
self.path_to_index = None
self.path_dim = 0
# Set the random seed
if hparams.random_seed > 0:
tf.set_random_seed(hparams.random_seed)
# Get the vocabulary size and relata dim
if self.hparams.input in ['dist', 'dist-nc', 'integrated', 'integrated-nc']:
self.vocab_size, self.relata_dim = self.relata_embeddings.shape
# Create the mapping from string path to an index in the embeddings matrix
if self.hparams.input in ['path', 'integrated', 'integrated-nc']:
      self.path_to_index = tf.contrib.lookup.HashTable(
          tf.contrib.lookup.KeyValueTensorInitializer(
              tf.constant(list(path_to_index.keys())),
              tf.constant(list(path_to_index.values()), dtype=tf.int32),
              key_dtype=tf.string, value_dtype=tf.int32), 0)
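      # Paths missing from the table fall back to the default value, index 0
      # (presumably a reserved "unknown path" row in the embeddings matrix).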
self.path_dim = self.path_embeddings.shape[1]
# Create the network
self.__create_computation_graph__()
def __create_computation_graph__(self):
"""Initialize the model and define the graph."""
network_input = 0
# Define the network inputs
# Distributional x and y
if self.hparams.input in ['dist', 'dist-nc', 'integrated', 'integrated-nc']:
network_input += 2 * self.relata_dim
self.relata_lookup = tf.get_variable(
'relata_lookup',
initializer=self.relata_embeddings,
dtype=tf.float32,
trainable=self.hparams.learn_relata)
# Path-based
if self.hparams.input in ['path', 'integrated', 'integrated-nc']:
network_input += self.path_dim
self.path_initial_value_t = tf.placeholder(tf.float32, None)
self.path_lookup = tf.get_variable(
name='path_lookup',
dtype=tf.float32,
trainable=False,
shape=self.path_embeddings.shape)
self.initialize_path_op = tf.assign(
self.path_lookup, self.path_initial_value_t, validate_shape=False)
# Distributional noun compound
if self.hparams.input in ['dist-nc', 'integrated-nc']:
network_input += self.relata_dim
self.nc_initial_value_t = tf.placeholder(tf.float32, None)
self.nc_lookup = tf.get_variable(
name='nc_lookup',
dtype=tf.float32,
trainable=False,
shape=self.nc_embeddings.shape)
self.initialize_nc_op = tf.assign(
self.nc_lookup, self.nc_initial_value_t, validate_shape=False)
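    # The placeholder + tf.assign pattern above keeps the large embedding
    # matrices out of the serialized graph; the caller is expected to feed
    # them once after variable initialization, e.g. (a sketch):
    #
    #   session.run(model.initialize_path_op,
    #               feed_dict={model.path_initial_value_t: path_embeddings})
    #   session.run(model.initialize_nc_op,
    #               feed_dict={model.nc_initial_value_t: nc_embeddings})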
hidden_dim = network_input // 2
# Define the MLP
if self.hparams.hidden_layers == 0:
self.weights1 = tf.get_variable(
'W1',
shape=[network_input, self.hparams.num_classes],
dtype=tf.float32)
self.bias1 = tf.get_variable(
'b1',
shape=[self.hparams.num_classes],
dtype=tf.float32)
elif self.hparams.hidden_layers == 1:
self.weights1 = tf.get_variable(
'W1',
shape=[network_input, hidden_dim],
dtype=tf.float32)
self.bias1 = tf.get_variable(
'b1',
shape=[hidden_dim],
dtype=tf.float32)
self.weights2 = tf.get_variable(
'W2',
shape=[hidden_dim, self.hparams.num_classes],
dtype=tf.float32)
self.bias2 = tf.get_variable(
'b2',
shape=[self.hparams.num_classes],
dtype=tf.float32)
else:
raise ValueError('Only 0 or 1 hidden layers are supported')
# Define the variables
self.instances = tf.placeholder(dtype=tf.string,
shape=[self.hparams.batch_size])
(self.x_embedding_id,
self.y_embedding_id,
self.nc_embedding_id,
self.path_embedding_id,
self.path_counts,
self.labels) = parse_tensorflow_examples(
self.instances, self.hparams.batch_size, self.path_to_index)
# Create the MLP
self.__mlp__()
self.instances_to_load = tf.placeholder(dtype=tf.string, shape=[None])
self.labels_to_load = lexnet_common.load_all_labels(self.instances_to_load)
self.pairs_to_load = lexnet_common.load_all_pairs(self.instances_to_load)
def load_labels(self, session, instances):
"""Loads the labels for these instances.
Args:
      session: The current TensorFlow session.
instances: The instances for which to load the labels.
Returns:
the labels of these instances.
"""
return session.run(self.labels_to_load,
feed_dict={self.instances_to_load: instances})
def load_pairs(self, session, instances):
"""Loads the word pairs for these instances.
Args:
      session: The current TensorFlow session.
      instances: The instances for which to load the word pairs.
Returns:
the word pairs of these instances.
"""
word_pairs = session.run(self.pairs_to_load,
feed_dict={self.instances_to_load: instances})
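    # Each pair is serialized as a single 'x::y' string; split it back into
    # its two words.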
return [pair[0].split('::') for pair in word_pairs]
def __train_single_batch__(self, session, batch_instances):
"""Train a single batch.
Args:
session: The current TensorFlow session.
      batch_instances: TensorFlow examples containing the training instances.
Returns:
The cost for the current batch.
"""
cost, _ = session.run([self.cost, self.train_op],
feed_dict={self.instances: batch_instances})
return cost
def fit(self, session, inputs, on_epoch_completed, val_instances, val_labels,
save_path):
"""Train the model.
Args:
session: The current TensorFlow session.
      inputs: The training instances as serialized TensorFlow examples.
on_epoch_completed: A method to call after each epoch.
val_instances: The validation set instances (evaluation between epochs).
val_labels: The validation set labels (for evaluation between epochs).
save_path: Where to save the model.
"""
for epoch in range(self.hparams.num_epochs):
losses = []
epoch_indices = list(np.random.permutation(len(inputs)))
      # If the number of instances doesn't divide evenly by batch_size, pad
      # the epoch with randomly drawn training examples.
      mod = len(epoch_indices) % self.hparams.batch_size
      if mod > 0:
        padding = self.hparams.batch_size - mod
        epoch_indices.extend(
            np.random.randint(0, high=len(inputs), size=padding))
# Define the batches
n_batches = len(epoch_indices) // self.hparams.batch_size
for minibatch in range(n_batches):
batch_indices = epoch_indices[minibatch * self.hparams.batch_size:(
minibatch + 1) * self.hparams.batch_size]
batch_instances = [inputs[i] for i in batch_indices]
loss = self.__train_single_batch__(session, batch_instances)
losses.append(loss)
epoch_loss = np.nanmean(losses)
if on_epoch_completed:
should_stop = on_epoch_completed(self, session, epoch, epoch_loss,
val_instances, val_labels, save_path)
if should_stop:
          print('Stopping training after %d epochs.' % (epoch + 1))
return
def predict(self, session, inputs):
"""Predict the classification of the test set.
Args:
session: The current TensorFlow session.
      inputs: the test paths, x, y and/or nc vectors.
Returns:
The test predictions.
"""
predictions, _ = zip(*self.predict_with_score(session, inputs))
return np.array(predictions)
def predict_with_score(self, session, inputs):
"""Predict the classification of the test set.
Args:
session: The current TensorFlow session.
inputs: the test paths, x, y and/or nc vectors
Returns:
The test predictions along with their scores.
"""
test_pred = [0] * len(inputs)
for chunk in xrange(0, len(test_pred), self.hparams.batch_size):
# Initialize the variables with the current batch data
batch_indices = list(
range(chunk, min(chunk + self.hparams.batch_size, len(test_pred))))
      # If the last batch is smaller than batch_size, pad it with copies of
      # the first instance; their outputs simply overwrite entry 0 with a
      # prediction for that same instance.
      if len(batch_indices) < self.hparams.batch_size:
        batch_indices += [0] * (self.hparams.batch_size - len(batch_indices))
batch_instances = [inputs[i] for i in batch_indices]
predictions, scores = session.run(
[self.predictions, self.scores],
feed_dict={self.instances: batch_instances})
for index_in_batch, index_in_dataset in enumerate(batch_indices):
prediction = predictions[index_in_batch]
score = scores[index_in_batch][prediction]
test_pred[index_in_dataset] = (prediction, score)
return test_pred
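  # An inference sketch (hypothetical names; `test_instances` is a list of
  # serialized TensorFlow examples):
  #
  #   predictions = model.predict(session, test_instances)
  #   predictions_with_scores = model.predict_with_score(session,
  #                                                      test_instances)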
def __mlp__(self):
"""Performs the MLP operations.
Returns: the prediction object to be computed in a Session
"""
# Define the operations
# Network input
vec_inputs = []
# Distributional component
if self.hparams.input in ['dist', 'dist-nc', 'integrated', 'integrated-nc']:
for emb_id in [self.x_embedding_id, self.y_embedding_id]:
vec_inputs.append(tf.nn.embedding_lookup(self.relata_lookup, emb_id))
# Noun compound component
if self.hparams.input in ['dist-nc', 'integrated-nc']:
vec = tf.nn.embedding_lookup(self.nc_lookup, self.nc_embedding_id)
vec_inputs.append(vec)
# Path-based component
if self.hparams.input in ['path', 'integrated', 'integrated-nc']:
# Get the current paths for each batch instance
self.path_embeddings = tf.nn.embedding_lookup(self.path_lookup,
self.path_embedding_id)
# self.path_embeddings is of shape
# [batch_size, max_path_per_instance, output_dim]
# We need to multiply it by path counts
# ([batch_size, max_path_per_instance]).
# Start by duplicating path_counts along the output_dim axis.
self.path_freq = tf.tile(tf.expand_dims(self.path_counts, -1),
[1, 1, self.path_dim])
# Compute the averaged path vector for each instance.
# First, multiply the path embeddings and frequencies element-wise.
self.weighted = tf.multiply(self.path_freq, self.path_embeddings)
# Second, take the sum to get a tensor of shape [batch_size, output_dim].
self.pair_path_embeddings = tf.reduce_sum(self.weighted, 1)
# Finally, divide by the total number of paths.
# The number of paths for each pair has a shape [batch_size, 1],
# We duplicate it output_dim times along the second axis.
self.num_paths = tf.clip_by_value(
tf.reduce_sum(self.path_counts, 1), 1, np.inf)
self.num_paths = tf.tile(tf.expand_dims(self.num_paths, -1),
[1, self.path_dim])
# And finally, divide pair_path_embeddings by num_paths element-wise.
self.pair_path_embeddings = tf.div(
self.pair_path_embeddings, self.num_paths)
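      # Shape walk-through with illustrative numbers: for batch_size=10,
      # max_path_per_instance=5 and path_dim=60, path_embeddings is
      # [10, 5, 60]; path_freq is tiled to [10, 5, 60]; their element-wise
      # product summed over axis 1 is [10, 60]; dividing by the clipped path
      # totals, tiled to [10, 60], turns the sum into a weighted mean.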
vec_inputs.append(self.pair_path_embeddings)
# Concatenate the inputs and feed to the MLP
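    # Note: the dropout below is applied unconditionally, so it is active at
    # inference time as well as during training.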
self.input_vec = tf.nn.dropout(
tf.concat(vec_inputs, 1),
keep_prob=self.hparams.input_keep_prob)
h = tf.matmul(self.input_vec, self.weights1)
self.output = h
if self.hparams.hidden_layers == 1:
self.output = tf.matmul(tf.nn.tanh(h), self.weights2)
self.scores = self.output
self.predictions = tf.argmax(self.scores, axis=1)
# Define the loss function and the optimization algorithm
self.cross_entropies = tf.nn.sparse_softmax_cross_entropy_with_logits(
logits=self.scores, labels=self.labels)
self.cost = tf.reduce_sum(self.cross_entropies, name='cost')
self.global_step = tf.Variable(0, name='global_step', trainable=False)
self.optimizer = tf.train.AdamOptimizer()
self.train_op = self.optimizer.minimize(
self.cost, global_step=self.global_step)
def parse_tensorflow_examples(record, batch_size, path_to_index):
"""Reads TensorFlow examples from a RecordReader.
Args:
record: a record with TensorFlow examples.
batch_size: the number of instances in a minibatch
path_to_index: mapping from string path to index in the embeddings matrix.
Returns:
    The x, y and noun compound embedding IDs, the path embedding IDs, the
    path counts, and the labels.
"""
features = tf.parse_example(
record, {
'x_embedding_id': tf.FixedLenFeature([1], dtype=tf.int64),
'y_embedding_id': tf.FixedLenFeature([1], dtype=tf.int64),
'nc_embedding_id': tf.FixedLenFeature([1], dtype=tf.int64),
'reprs': tf.FixedLenSequenceFeature(
shape=(), dtype=tf.string, allow_missing=True),
'counts': tf.FixedLenSequenceFeature(
shape=(), dtype=tf.int64, allow_missing=True),
'rel_id': tf.FixedLenFeature([1], dtype=tf.int64)
})
x_embedding_id = tf.squeeze(features['x_embedding_id'], [-1])
y_embedding_id = tf.squeeze(features['y_embedding_id'], [-1])
nc_embedding_id = tf.squeeze(features['nc_embedding_id'], [-1])
labels = tf.squeeze(features['rel_id'], [-1])
path_counts = tf.to_float(tf.reshape(features['counts'], [batch_size, -1]))
path_embedding_id = None
if path_to_index:
path_embedding_id = path_to_index.lookup(features['reprs'])
return (
x_embedding_id, y_embedding_id, nc_embedding_id,
path_embedding_id, path_counts, labels)
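

# A sketch of one instance matching the schema parsed above (hypothetical
# IDs, path string and counts):
#
#   example = tf.train.Example(features=tf.train.Features(feature={
#       'x_embedding_id': tf.train.Feature(
#           int64_list=tf.train.Int64List(value=[12])),
#       'y_embedding_id': tf.train.Feature(
#           int64_list=tf.train.Int64List(value=[45])),
#       'nc_embedding_id': tf.train.Feature(
#           int64_list=tf.train.Int64List(value=[3])),
#       'reprs': tf.train.Feature(bytes_list=tf.train.BytesList(
#           value=[b'X/NOUN/compound>_Y/NOUN/root'])),
#       'counts': tf.train.Feature(
#           int64_list=tf.train.Int64List(value=[2])),
#       'rel_id': tf.train.Feature(
#           int64_list=tf.train.Int64List(value=[5])),
#   }))
#   serialized = example.SerializeToString()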