# Copyright 2017, 2018 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""The integrated LexNET model."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import lexnet_common
import numpy as np
import tensorflow as tf
from six.moves import xrange


class LexNETModel(object):
  """The LexNET model for classifying relationships between noun compounds."""

  @classmethod
  def default_hparams(cls):
    """Returns the default hyper-parameters."""
    return tf.contrib.training.HParams(
        batch_size=10,
        num_classes=37,
        num_epochs=30,
        input_keep_prob=0.9,
        input='integrated',  # dist/ dist-nc/ path/ integrated/ integrated-nc
        learn_relata=False,
        corpus='wiki_gigawords',
        random_seed=133,  # zero means no random seed
        relata_embeddings_file='glove/glove.6B.300d.bin',
        nc_embeddings_file='nc_glove/vecs.6B.300d.bin',
        path_embeddings_file='path_embeddings/tratz/fine_grained/wiki',
        hidden_layers=1,
        path_dim=60)

  def __init__(self, hparams, relata_embeddings, path_embeddings,
               nc_embeddings, path_to_index):
    """Initialize the LexNET classifier.

    Args:
      hparams: the hyper-parameters.
      relata_embeddings: word embeddings for the distributional component.
      path_embeddings: embeddings for the paths.
      nc_embeddings: noun compound embeddings.
      path_to_index: a mapping from string path to an index in the path
        embeddings matrix.
    """
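    # Illustrative note (not in the original source): hparams usually come
    # from LexNETModel.default_hparams(), with individual values overridden,
    # e.g.:
    #   hparams = LexNETModel.default_hparams()
    #   hparams.set_hparam('input', 'path')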
""" self.hparams = hparams self.path_embeddings = path_embeddings self.relata_embeddings = relata_embeddings self.nc_embeddings = nc_embeddings self.vocab_size, self.relata_dim = 0, 0 self.path_to_index = None self.path_dim = 0 # Set the random seed if hparams.random_seed > 0: tf.set_random_seed(hparams.random_seed) # Get the vocabulary size and relata dim if self.hparams.input in ['dist', 'dist-nc', 'integrated', 'integrated-nc']: self.vocab_size, self.relata_dim = self.relata_embeddings.shape # Create the mapping from string path to an index in the embeddings matrix if self.hparams.input in ['path', 'integrated', 'integrated-nc']: self.path_to_index = tf.contrib.lookup.HashTable( tf.contrib.lookup.KeyValueTensorInitializer( tf.constant(path_to_index.keys()), tf.constant(path_to_index.values()), key_dtype=tf.string, value_dtype=tf.int32), 0) self.path_dim = self.path_embeddings.shape[1] # Create the network self.__create_computation_graph__() def __create_computation_graph__(self): """Initialize the model and define the graph.""" network_input = 0 # Define the network inputs # Distributional x and y if self.hparams.input in ['dist', 'dist-nc', 'integrated', 'integrated-nc']: network_input += 2 * self.relata_dim self.relata_lookup = tf.get_variable( 'relata_lookup', initializer=self.relata_embeddings, dtype=tf.float32, trainable=self.hparams.learn_relata) # Path-based if self.hparams.input in ['path', 'integrated', 'integrated-nc']: network_input += self.path_dim self.path_initial_value_t = tf.placeholder(tf.float32, None) self.path_lookup = tf.get_variable( name='path_lookup', dtype=tf.float32, trainable=False, shape=self.path_embeddings.shape) self.initialize_path_op = tf.assign( self.path_lookup, self.path_initial_value_t, validate_shape=False) # Distributional noun compound if self.hparams.input in ['dist-nc', 'integrated-nc']: network_input += self.relata_dim self.nc_initial_value_t = tf.placeholder(tf.float32, None) self.nc_lookup = tf.get_variable( name='nc_lookup', dtype=tf.float32, trainable=False, shape=self.nc_embeddings.shape) self.initialize_nc_op = tf.assign( self.nc_lookup, self.nc_initial_value_t, validate_shape=False) hidden_dim = network_input // 2 # Define the MLP if self.hparams.hidden_layers == 0: self.weights1 = tf.get_variable( 'W1', shape=[network_input, self.hparams.num_classes], dtype=tf.float32) self.bias1 = tf.get_variable( 'b1', shape=[self.hparams.num_classes], dtype=tf.float32) elif self.hparams.hidden_layers == 1: self.weights1 = tf.get_variable( 'W1', shape=[network_input, hidden_dim], dtype=tf.float32) self.bias1 = tf.get_variable( 'b1', shape=[hidden_dim], dtype=tf.float32) self.weights2 = tf.get_variable( 'W2', shape=[hidden_dim, self.hparams.num_classes], dtype=tf.float32) self.bias2 = tf.get_variable( 'b2', shape=[self.hparams.num_classes], dtype=tf.float32) else: raise ValueError('Only 0 or 1 hidden layers are supported') # Define the variables self.instances = tf.placeholder(dtype=tf.string, shape=[self.hparams.batch_size]) (self.x_embedding_id, self.y_embedding_id, self.nc_embedding_id, self.path_embedding_id, self.path_counts, self.labels) = parse_tensorflow_examples( self.instances, self.hparams.batch_size, self.path_to_index) # Create the MLP self.__mlp__() self.instances_to_load = tf.placeholder(dtype=tf.string, shape=[None]) self.labels_to_load = lexnet_common.load_all_labels(self.instances_to_load) self.pairs_to_load = lexnet_common.load_all_pairs(self.instances_to_load) def load_labels(self, session, instances): """Loads the labels for these 
    """Loads the labels for these instances.

    Args:
      session: The current TensorFlow session.
      instances: The instances for which to load the labels.

    Returns:
      the labels of these instances.
    """
    return session.run(self.labels_to_load,
                       feed_dict={self.instances_to_load: instances})

  def load_pairs(self, session, instances):
    """Loads the word pairs for these instances.

    Args:
      session: The current TensorFlow session.
      instances: The instances for which to load the word pairs.

    Returns:
      the word pairs of these instances.
    """
    word_pairs = session.run(self.pairs_to_load,
                             feed_dict={self.instances_to_load: instances})
    return [pair[0].split('::') for pair in word_pairs]

  def __train_single_batch__(self, session, batch_instances):
    """Train a single batch.

    Args:
      session: The current TensorFlow session.
      batch_instances: TensorFlow examples containing the training instances.

    Returns:
      The cost for the current batch.
    """
    cost, _ = session.run([self.cost, self.train_op],
                          feed_dict={self.instances: batch_instances})

    return cost

  def fit(self, session, inputs, on_epoch_completed, val_instances, val_labels,
          save_path):
    """Train the model.

    Args:
      session: The current TensorFlow session.
      inputs: The training set instances (serialized TensorFlow examples).
      on_epoch_completed: A method to call after each epoch.
      val_instances: The validation set instances (evaluation between epochs).
      val_labels: The validation set labels (for evaluation between epochs).
      save_path: Where to save the model.
    """
    for epoch in range(self.hparams.num_epochs):
      losses = []
      epoch_indices = list(np.random.permutation(len(inputs)))

      # If the number of instances doesn't divide by batch_size, pad the last
      # batch by duplicating randomly-chosen training examples.
      mod = len(epoch_indices) % self.hparams.batch_size
      if mod > 0:
        epoch_indices.extend(
            [np.random.randint(0, high=len(inputs))
             for _ in range(self.hparams.batch_size - mod)])

      # Define the batches
      n_batches = len(epoch_indices) // self.hparams.batch_size

      for minibatch in range(n_batches):
        batch_indices = epoch_indices[minibatch * self.hparams.batch_size:(
            minibatch + 1) * self.hparams.batch_size]
        batch_instances = [inputs[i] for i in batch_indices]

        loss = self.__train_single_batch__(session, batch_instances)
        losses.append(loss)

      epoch_loss = np.nanmean(losses)

      if on_epoch_completed:
        should_stop = on_epoch_completed(self, session, epoch, epoch_loss,
                                         val_instances, val_labels, save_path)
        if should_stop:
          print('Stopping training after %d epochs.' % epoch)
          return

  def predict(self, session, inputs):
    """Predict the classification of the test set.

    Args:
      session: The current TensorFlow session.
      inputs: the test paths, x, y and/or nc vectors.

    Returns:
      The test predictions.
    """
    predictions, _ = zip(*self.predict_with_score(session, inputs))
    return np.array(predictions)

  def predict_with_score(self, session, inputs):
    """Predict the classification of the test set.

    Args:
      session: The current TensorFlow session.
      inputs: the test paths, x, y and/or nc vectors.

    Returns:
      The test predictions along with their scores.
""" test_pred = [0] * len(inputs) for chunk in xrange(0, len(test_pred), self.hparams.batch_size): # Initialize the variables with the current batch data batch_indices = list( range(chunk, min(chunk + self.hparams.batch_size, len(test_pred)))) # If the batch is too small, add a few other examples if len(batch_indices) < self.hparams.batch_size: batch_indices += [0] * (self.hparams.batch_size-len(batch_indices)) batch_instances = [inputs[i] for i in batch_indices] predictions, scores = session.run( [self.predictions, self.scores], feed_dict={self.instances: batch_instances}) for index_in_batch, index_in_dataset in enumerate(batch_indices): prediction = predictions[index_in_batch] score = scores[index_in_batch][prediction] test_pred[index_in_dataset] = (prediction, score) return test_pred def __mlp__(self): """Performs the MLP operations. Returns: the prediction object to be computed in a Session """ # Define the operations # Network input vec_inputs = [] # Distributional component if self.hparams.input in ['dist', 'dist-nc', 'integrated', 'integrated-nc']: for emb_id in [self.x_embedding_id, self.y_embedding_id]: vec_inputs.append(tf.nn.embedding_lookup(self.relata_lookup, emb_id)) # Noun compound component if self.hparams.input in ['dist-nc', 'integrated-nc']: vec = tf.nn.embedding_lookup(self.nc_lookup, self.nc_embedding_id) vec_inputs.append(vec) # Path-based component if self.hparams.input in ['path', 'integrated', 'integrated-nc']: # Get the current paths for each batch instance self.path_embeddings = tf.nn.embedding_lookup(self.path_lookup, self.path_embedding_id) # self.path_embeddings is of shape # [batch_size, max_path_per_instance, output_dim] # We need to multiply it by path counts # ([batch_size, max_path_per_instance]). # Start by duplicating path_counts along the output_dim axis. self.path_freq = tf.tile(tf.expand_dims(self.path_counts, -1), [1, 1, self.path_dim]) # Compute the averaged path vector for each instance. # First, multiply the path embeddings and frequencies element-wise. self.weighted = tf.multiply(self.path_freq, self.path_embeddings) # Second, take the sum to get a tensor of shape [batch_size, output_dim]. self.pair_path_embeddings = tf.reduce_sum(self.weighted, 1) # Finally, divide by the total number of paths. # The number of paths for each pair has a shape [batch_size, 1], # We duplicate it output_dim times along the second axis. self.num_paths = tf.clip_by_value( tf.reduce_sum(self.path_counts, 1), 1, np.inf) self.num_paths = tf.tile(tf.expand_dims(self.num_paths, -1), [1, self.path_dim]) # And finally, divide pair_path_embeddings by num_paths element-wise. 
      self.pair_path_embeddings = tf.div(
          self.pair_path_embeddings, self.num_paths)

      vec_inputs.append(self.pair_path_embeddings)

    # Concatenate the inputs and feed to the MLP
    self.input_vec = tf.nn.dropout(
        tf.concat(vec_inputs, 1),
        keep_prob=self.hparams.input_keep_prob)

    h = tf.matmul(self.input_vec, self.weights1)
    self.output = h

    if self.hparams.hidden_layers == 1:
      self.output = tf.matmul(tf.nn.tanh(h), self.weights2)

    self.scores = self.output
    self.predictions = tf.argmax(self.scores, axis=1)

    # Define the loss function and the optimization algorithm
    self.cross_entropies = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=self.scores, labels=self.labels)
    self.cost = tf.reduce_sum(self.cross_entropies, name='cost')
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    self.optimizer = tf.train.AdamOptimizer()
    self.train_op = self.optimizer.minimize(
        self.cost, global_step=self.global_step)


def parse_tensorflow_examples(record, batch_size, path_to_index):
  """Parses a batch of serialized TensorFlow examples.

  Args:
    record: a tensor of serialized TensorFlow examples.
    batch_size: the number of instances in a minibatch.
    path_to_index: mapping from string path to index in the embeddings matrix.

  Returns:
    The word embedding IDs, path embedding IDs, path counts and labels.
  """
  features = tf.parse_example(
      record, {
          'x_embedding_id': tf.FixedLenFeature([1], dtype=tf.int64),
          'y_embedding_id': tf.FixedLenFeature([1], dtype=tf.int64),
          'nc_embedding_id': tf.FixedLenFeature([1], dtype=tf.int64),
          'reprs': tf.FixedLenSequenceFeature(
              shape=(), dtype=tf.string, allow_missing=True),
          'counts': tf.FixedLenSequenceFeature(
              shape=(), dtype=tf.int64, allow_missing=True),
          'rel_id': tf.FixedLenFeature([1], dtype=tf.int64)
      })

  x_embedding_id = tf.squeeze(features['x_embedding_id'], [-1])
  y_embedding_id = tf.squeeze(features['y_embedding_id'], [-1])
  nc_embedding_id = tf.squeeze(features['nc_embedding_id'], [-1])
  labels = tf.squeeze(features['rel_id'], [-1])

  path_counts = tf.to_float(tf.reshape(features['counts'], [batch_size, -1]))

  path_embedding_id = None
  if path_to_index:
    path_embedding_id = path_to_index.lookup(features['reprs'])

  return (
      x_embedding_id, y_embedding_id, nc_embedding_id,
      path_embedding_id, path_counts, labels)
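

# ------------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): a minimal,
# hedged example of how the pieces above are typically wired together. The
# embedding matrices, instance lists, path map and save path are hypothetical
# placeholders assumed to have been loaded elsewhere.
#
#   hparams = LexNETModel.default_hparams()
#   model = LexNETModel(hparams, relata_embeddings, path_embeddings,
#                       nc_embeddings, path_to_index)
#
#   with tf.Session() as session:
#     session.run(tf.global_variables_initializer())
#     session.run(tf.tables_initializer())
#     # The path lookup table is filled through its init op; initialize_nc_op
#     # exists only for the 'dist-nc' / 'integrated-nc' inputs.
#     session.run(model.initialize_path_op,
#                 feed_dict={model.path_initial_value_t: path_embeddings})
#     model.fit(session, train_instances, on_epoch_completed=None,
#               val_instances=val_instances, val_labels=val_labels,
#               save_path='/tmp/lexnet')  # hypothetical path
#     predictions = model.predict(session, test_instances)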