# Copyright 2017, 2018 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""LexNET Path-based Model."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import itertools
import os

import lexnet_common
import numpy as np
import tensorflow as tf


class PathBasedModel(object):
  """The LexNET path-based model for classifying semantic relations."""

  @classmethod
  def default_hparams(cls):
    """Returns the default hyper-parameters."""
    return tf.contrib.training.HParams(
        max_path_len=8,
        num_classes=37,
        num_epochs=30,
        input_keep_prob=0.9,
        learning_rate=0.001,
        learn_lemmas=False,
        random_seed=133,  # zero means no random seed
        lemma_embeddings_file='glove/glove.6B.50d.bin',
        num_pos=len(lexnet_common.POSTAGS),
        num_dep=len(lexnet_common.DEPLABELS),
        num_directions=len(lexnet_common.DIRS),
        lemma_dim=50,
        pos_dim=4,
        dep_dim=5,
        dir_dim=1)

  def __init__(self, hparams, lemma_embeddings, instance):
    """Initialize the LexNET classifier.

    Args:
      hparams: the hyper-parameters.
      lemma_embeddings: word embeddings for the path-based component.
      instance: string tensor containing the input instance.
    """
    self.hparams = hparams
    self.lemma_embeddings = lemma_embeddings
    self.instance = instance
    self.vocab_size, self.lemma_dim = self.lemma_embeddings.shape

    # Set the random seed
    if hparams.random_seed > 0:
      tf.set_random_seed(hparams.random_seed)

    # Create the network
    self.__create_computation_graph__()

  def __create_computation_graph__(self):
    """Initialize the model and define the graph."""
    self.lstm_input_dim = sum([self.hparams.lemma_dim, self.hparams.pos_dim,
                               self.hparams.dep_dim, self.hparams.dir_dim])
    self.lstm_output_dim = self.lstm_input_dim

    network_input = self.lstm_output_dim

    self.lemma_lookup = tf.get_variable(
        'lemma_lookup',
        initializer=self.lemma_embeddings,
        dtype=tf.float32,
        trainable=self.hparams.learn_lemmas)
    self.pos_lookup = tf.get_variable(
        'pos_lookup',
        shape=[self.hparams.num_pos, self.hparams.pos_dim],
        dtype=tf.float32)
    self.dep_lookup = tf.get_variable(
        'dep_lookup',
        shape=[self.hparams.num_dep, self.hparams.dep_dim],
        dtype=tf.float32)
    self.dir_lookup = tf.get_variable(
        'dir_lookup',
        shape=[self.hparams.num_directions, self.hparams.dir_dim],
        dtype=tf.float32)

    self.weights1 = tf.get_variable(
        'W1',
        shape=[network_input, self.hparams.num_classes],
        dtype=tf.float32)
    self.bias1 = tf.get_variable(
        'b1',
        shape=[self.hparams.num_classes],
        dtype=tf.float32)

    # Define the variables
    (self.batch_paths, self.path_counts, self.seq_lengths, self.path_strings,
     self.batch_labels) = _parse_tensorflow_example(
         self.instance, self.hparams.max_path_len,
         self.hparams.input_keep_prob)

    # Create the LSTM
    self.__lstm__()

    # Create the MLP
    self.__mlp__()

    self.instances_to_load = tf.placeholder(dtype=tf.string, shape=[None])
    self.labels_to_load = lexnet_common.load_all_labels(self.instances_to_load)
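
  # A note on tensor shapes for the graph built above, with the default
  # hyper-parameters: batch_paths holds [num_paths, max_path_len, 4] edge IDs;
  # embedding and concatenating the four components (50 + 4 + 5 + 1) gives a
  # path matrix of [num_paths, max_path_len, 60]; the LSTM's last relevant
  # output is the [num_paths, 60] path embedding matrix; multiplying by W1
  # ([60, 37]) and averaging over paths, weighted by path counts, yields the
  # [37] class scores.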
  def load_labels(self, session, batch_instances):
    """Loads the labels of the current instances.

    Args:
      session: the current TensorFlow session.
      batch_instances: the dataset instances.

    Returns:
      the labels.
    """
    return session.run(self.labels_to_load,
                       feed_dict={self.instances_to_load: batch_instances})

  def run_one_epoch(self, session, num_steps):
    """Train the model.

    Args:
      session: The current TensorFlow session.
      num_steps: The number of steps in each epoch.

    Returns:
      The mean loss for the epoch.

    Raises:
      ArithmeticError: if the loss becomes non-finite.
    """
    losses = []

    for step in range(num_steps):
      curr_loss, _ = session.run([self.cost, self.train_op])
      if not np.isfinite(curr_loss):
        raise ArithmeticError('nan loss at step %d' % step)
      losses.append(curr_loss)

    return np.mean(losses)

  def predict(self, session, inputs):
    """Predict the classification of the test set.

    Args:
      session: The current TensorFlow session.
      inputs: the test paths, x, y and/or nc vectors.

    Returns:
      The test predictions.
    """
    predictions, _ = zip(*self.predict_with_score(session, inputs))
    return np.array(predictions)

  def predict_with_score(self, session, inputs):
    """Predict the classification of the test set.

    Args:
      session: The current TensorFlow session.
      inputs: the test paths, x, y and/or nc vectors.

    Returns:
      The test predictions along with their scores.
    """
    test_pred = [0] * len(inputs)

    for index, instance in enumerate(inputs):
      prediction, scores = session.run(
          [self.predictions, self.scores],
          feed_dict={self.instance: instance})

      test_pred[index] = (prediction, scores[prediction])

    return test_pred

  def __mlp__(self):
    """Performs the MLP operations.

    Defines the scores, predictions, loss and training ops as attributes,
    to be computed in a Session.
    """
    # Feed the paths to the MLP: path_embeddings is
    # [num_batch_paths, output_dim], and when we multiply it by W
    # ([output_dim, num_classes]), we get a matrix of class distributions:
    # [num_batch_paths, num_classes].
    self.distributions = tf.matmul(self.path_embeddings, self.weights1)

    # Now, compute a weighted average of the class distributions, using the
    # path frequency as weights.

    # First, reshape path_freq to the same shape as distributions.
    self.path_freq = tf.tile(tf.expand_dims(self.path_counts, -1),
                             [1, self.hparams.num_classes])

    # Second, multiply the distributions and frequencies element-wise.
    self.weighted = tf.multiply(self.path_freq, self.distributions)

    # Finally, take the average to get a tensor of shape [num_classes].
    self.weighted_sum = tf.reduce_sum(self.weighted, 0)
    self.num_paths = tf.clip_by_value(tf.reduce_sum(self.path_counts),
                                      1, np.inf)
    self.num_paths = tf.tile(tf.expand_dims(self.num_paths, -1),
                             [self.hparams.num_classes])
    self.scores = tf.div(self.weighted_sum, self.num_paths)
    self.predictions = tf.argmax(self.scores)

    # Define the loss function and the optimization algorithm
    self.cross_entropies = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=self.scores, labels=tf.reduce_mean(self.batch_labels))
    self.cost = tf.reduce_sum(self.cross_entropies, name='cost')
    self.global_step = tf.Variable(0, name='global_step', trainable=False)
    self.optimizer = tf.train.AdamOptimizer()
    self.train_op = self.optimizer.minimize(self.cost,
                                            global_step=self.global_step)

  def __lstm__(self):
    """Defines the LSTM operations.

    Stores the matrix of path embeddings in self.path_embeddings.
    """
    lookup_tables = [self.lemma_lookup, self.pos_lookup,
                     self.dep_lookup, self.dir_lookup]

    # Split the edges to components: list of 4 tensors
    # [num_batch_paths, max_path_len, 1]
    self.edge_components = tf.split(self.batch_paths, 4, axis=2)

    # Look up the components embeddings and concatenate them back together
    self.path_matrix = tf.concat([
        tf.squeeze(tf.nn.embedding_lookup(lookup_table, component), 2)
        for lookup_table, component in
        zip(lookup_tables, self.edge_components)
    ], axis=2)

    self.sequence_lengths = tf.reshape(self.seq_lengths, [-1])

    # Define the LSTM.
    # The input is [num_batch_paths, max_path_len, input_dim].
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(self.lstm_output_dim)

    # The output is [num_batch_paths, max_path_len, output_dim].
    self.lstm_outputs, _ = tf.nn.dynamic_rnn(
        lstm_cell, self.path_matrix, dtype=tf.float32,
        sequence_length=self.sequence_lengths)

    # Slice the last *relevant* output for each instance ->
    # [num_batch_paths, output_dim]
    self.path_embeddings = _extract_last_relevant(self.lstm_outputs,
                                                  self.sequence_lengths)
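
# A minimal usage sketch (not from the original training script; the file
# name, the plain TFRecord reader and the step count per epoch are
# assumptions):
#
#   hparams = PathBasedModel.default_hparams()
#   lemma_embeddings = ...  # numpy array of shape [vocab_size, lemma_dim]
#
#   reader = tf.TFRecordReader()
#   _, instance = reader.read(
#       tf.train.string_input_producer(['train.tfrecs']))
#   model = PathBasedModel(hparams, lemma_embeddings, instance)
#
#   with tf.Session() as session:
#     session.run(tf.global_variables_initializer())
#     tf.train.start_queue_runners(sess=session)
#     for epoch in range(hparams.num_epochs):
#       loss = model.run_one_epoch(session, num_steps=1000)
#       print('Epoch %d, mean loss: %f' % (epoch, loss))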
""" lookup_tables = [self.lemma_lookup, self.pos_lookup, self.dep_lookup, self.dir_lookup] # Split the edges to components: list of 4 tensors # [num_batch_paths, max_path_len, 1] self.edge_components = tf.split(self.batch_paths, 4, axis=2) # Look up the components embeddings and concatenate them back together self.path_matrix = tf.concat([ tf.squeeze(tf.nn.embedding_lookup(lookup_table, component), 2) for lookup_table, component in zip(lookup_tables, self.edge_components) ], axis=2) self.sequence_lengths = tf.reshape(self.seq_lengths, [-1]) # Define the LSTM. # The input is [num_batch_paths, max_path_len, input_dim]. lstm_cell = tf.contrib.rnn.BasicLSTMCell(self.lstm_output_dim) # The output is [num_batch_paths, max_path_len, output_dim]. self.lstm_outputs, _ = tf.nn.dynamic_rnn( lstm_cell, self.path_matrix, dtype=tf.float32, sequence_length=self.sequence_lengths) # Slice the last *relevant* output for each instance -> # [num_batch_paths, output_dim] self.path_embeddings = _extract_last_relevant(self.lstm_outputs, self.sequence_lengths) def _parse_tensorflow_example(record, max_path_len, input_keep_prob): """Reads TensorFlow examples from a RecordReader. Args: record: a record with TensorFlow example. max_path_len: the maximum path length. input_keep_prob: 1 - the word dropout probability Returns: The paths and counts """ features = tf.parse_single_example(record, { 'lemmas': tf.FixedLenSequenceFeature( shape=(), dtype=tf.int64, allow_missing=True), 'postags': tf.FixedLenSequenceFeature( shape=(), dtype=tf.int64, allow_missing=True), 'deplabels': tf.FixedLenSequenceFeature( shape=(), dtype=tf.int64, allow_missing=True), 'dirs': tf.FixedLenSequenceFeature( shape=(), dtype=tf.int64, allow_missing=True), 'counts': tf.FixedLenSequenceFeature( shape=(), dtype=tf.int64, allow_missing=True), 'pathlens': tf.FixedLenSequenceFeature( shape=(), dtype=tf.int64, allow_missing=True), 'reprs': tf.FixedLenSequenceFeature( shape=(), dtype=tf.string, allow_missing=True), 'rel_id': tf.FixedLenFeature([], dtype=tf.int64) }) path_counts = tf.to_float(features['counts']) seq_lengths = features['pathlens'] # Concatenate the edge components to create a path tensor: # [max_paths_per_ins, max_path_length, 4] lemmas = _word_dropout( tf.reshape(features['lemmas'], [-1, max_path_len]), input_keep_prob) paths = tf.stack( [lemmas] + [ tf.reshape(features[f], [-1, max_path_len]) for f in ('postags', 'deplabels', 'dirs') ], axis=-1) path_strings = features['reprs'] # Add an empty path to pairs with no paths paths = tf.cond( tf.shape(paths)[0] > 0, lambda: paths, lambda: tf.zeros([1, max_path_len, 4], dtype=tf.int64)) # Paths are left-padded. We reverse them to make them right-padded. #paths = tf.reverse(paths, axis=[1]) path_counts = tf.cond( tf.shape(path_counts)[0] > 0, lambda: path_counts, lambda: tf.constant([1.0], dtype=tf.float32)) seq_lengths = tf.cond( tf.shape(seq_lengths)[0] > 0, lambda: seq_lengths, lambda: tf.constant([1], dtype=tf.int64)) # Duplicate the label for each path labels = tf.ones_like(path_counts, dtype=tf.int64) * features['rel_id'] return paths, path_counts, seq_lengths, path_strings, labels def _extract_last_relevant(output, seq_lengths): """Get the last relevant LSTM output cell for each batch instance. Args: output: the LSTM outputs - a tensor with shape [num_paths, output_dim, max_path_len] seq_lengths: the sequences length per instance Returns: The last relevant LSTM output cell for each batch instance. 
""" max_length = int(output.get_shape()[1]) path_lengths = tf.clip_by_value(seq_lengths - 1, 0, max_length) relevant = tf.reduce_sum(tf.multiply(output, tf.expand_dims( tf.one_hot(path_lengths, max_length), -1)), 1) return relevant def _word_dropout(words, input_keep_prob): """Drops words with probability 1 - input_keep_prob. Args: words: a list of lemmas from the paths. input_keep_prob: the probability to keep the word. Returns: The revised list where some of the words are ed. """ # Create the mask: (-1) to drop, 1 to keep prob = tf.random_uniform(tf.shape(words), 0, 1) condition = tf.less(prob, (1 - input_keep_prob)) mask = tf.where(condition, tf.negative(tf.ones_like(words)), tf.ones_like(words)) # We need to keep zeros (), and change other numbers to 1 () # if their mask is -1. First, we multiply the mask and the words. # Zeros will stay zeros, and words to drop will become negative. # Then, we change negative values to 1. masked_words = tf.multiply(mask, words) condition = tf.less(masked_words, 0) dropped_words = tf.where(condition, tf.ones_like(words), words) return dropped_words def compute_path_embeddings(model, session, instances): """Compute the path embeddings for all the distinct paths. Args: model: The trained path-based model. session: The current TensorFlow session. instances: All the train, test and validation instances. Returns: The path to ID index and the path embeddings. """ # Get an index for each distinct path path_index = collections.defaultdict(itertools.count(0).next) path_vectors = {} for instance in instances: curr_path_embeddings, curr_path_strings = session.run( [model.path_embeddings, model.path_strings], feed_dict={model.instance: instance}) for i, path in enumerate(curr_path_strings): if not path: continue # Set a new/existing index for the path index = path_index[path] # Save its vector path_vectors[index] = curr_path_embeddings[i, :] print('Number of distinct paths: %d' % len(path_index)) return path_index, path_vectors def save_path_embeddings(model, path_vectors, path_index, embeddings_base_path): """Saves the path embeddings. Args: model: The trained path-based model. path_vectors: The path embeddings. path_index: A map from path to ID. embeddings_base_path: The base directory where the embeddings are. """ index_range = range(max(path_index.values()) + 1) path_matrix = [path_vectors[i] for i in index_range] path_matrix = np.vstack(path_matrix) # Save the path embeddings path_vector_filename = os.path.join( embeddings_base_path, '%d_path_vectors' % model.lstm_output_dim) with open(path_vector_filename, 'w') as f_out: np.save(f_out, path_matrix) index_to_path = {i: p for p, i in path_index.iteritems()} path_vocab = [index_to_path[i] for i in index_range] # Save the path vocabulary path_vocab_filename = os.path.join( embeddings_base_path, '%d_path_vocab' % model.lstm_output_dim) with open(path_vocab_filename, 'w') as f_out: f_out.write('\n'.join(path_vocab)) f_out.write('\n') print('Saved path embeddings.') def load_path_embeddings(path_embeddings_dir, path_dim): """Loads pretrained path embeddings from a binary file and returns the matrix. Args: path_embeddings_dir: The directory for the path embeddings. path_dim: The dimension of the path embeddings, used as prefix to the path_vocab and path_vectors files. Returns: The path embeddings matrix and the path_to_index dictionary. 
""" prefix = path_embeddings_dir + '/%d' % path_dim + '_' with open(prefix + 'path_vocab') as f_in: vocab = f_in.read().splitlines() vocab_size = len(vocab) embedding_file = prefix + 'path_vectors' print('Embedding file "%s" has %d paths' % (embedding_file, vocab_size)) with open(embedding_file) as f_in: embeddings = np.load(f_in) path_to_index = {p: i for i, p in enumerate(vocab)} return embeddings, path_to_index def get_indicative_paths(model, session, path_index, path_vectors, classes, save_dir, k=20, threshold=0.8): """Gets the most indicative paths for each class. Args: model: The trained path-based model. session: The current TensorFlow session. path_index: A map from path to ID. path_vectors: The path embeddings. classes: The class label names. save_dir: Where to save the paths. k: The k for top-k paths. threshold: The threshold above which to consider paths as indicative. """ # Define graph variables for this operation p_path_embedding = tf.placeholder(dtype=tf.float32, shape=[1, model.lstm_output_dim]) p_distributions = tf.nn.softmax(tf.matmul(p_path_embedding, model.weights1)) # Treat each path as a pair instance with a single path, and get the # relation distribution for it. Then, take the top paths for each relation. # This dictionary contains a relation as a key, and the value is a list of # tuples of path index and score. A relation r will contain (p, s) if the # path p is classified to r with a confidence of s. prediction_per_relation = collections.defaultdict(list) index_to_path = {i: p for p, i in path_index.iteritems()} # Predict all the paths for index in range(len(path_index)): curr_path_vector = path_vectors[index] distribution = session.run(p_distributions, feed_dict={ p_path_embedding: np.reshape( curr_path_vector, [1, model.lstm_output_dim])}) distribution = distribution[0, :] prediction = np.argmax(distribution) prediction_per_relation[prediction].append( (index, distribution[prediction])) if index % 10000 == 0: print('Classified %d/%d (%3.2f%%) of the paths' % ( index, len(path_index), 100 * index / len(path_index))) # Retrieve k-best scoring paths for each relation for relation_index, relation in enumerate(classes): curr_paths = sorted(prediction_per_relation[relation_index], key=lambda item: item[1], reverse=True) above_t = [(p, s) for (p, s) in curr_paths if s >= threshold] top_k = curr_paths[k+1] relation_paths = above_t if len(above_t) > len(top_k) else top_k paths_filename = os.path.join(save_dir, '%s.paths' % relation) with open(paths_filename, 'w') as f_out: for index, score in relation_paths: print('\t'.join([index_to_path[index], str(score)]), file=f_out)