# coding=utf-8
# Copyright 2018 The Google AI Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for RACE dataset."""

from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function

import collections
import json
import os

from albert import classifier_utils
from albert import fine_tuning_utils
from albert import modeling
from albert import optimization
from albert import tokenization
import tensorflow.compat.v1 as tf
from tensorflow.contrib import tpu as contrib_tpu
class InputExample(object):
  """A single training/test example for the RACE dataset."""

  def __init__(self,
               example_id,
               context_sentence,
               start_ending,
               endings,
               label=None):
    """Constructs an InputExample.

    Args:
      example_id: Unique id for the example.
      context_sentence: string. The untokenized passage (article) text.
      start_ending: string or None. A common prefix shared by all endings;
        None when each ending already contains the full question+option text.
      endings: list of 4 strings, one per answer choice.
      label: (Optional) int. Index of the correct ending. Should be set for
        train/dev examples, and left as None for test examples.
    """
    self.example_id = example_id
    self.context_sentence = context_sentence
    self.start_ending = start_ending
    self.endings = endings
    self.label = label

  def __str__(self):
    return self.__repr__()

  def __repr__(self):
    # Human-readable dump of all fields; the label line is only included
    # when a label is present (i.e. not for test examples).
    fields = [
        "id: {}".format(self.example_id),
        "context_sentence: {}".format(self.context_sentence),
        "start_ending: {}".format(self.start_ending),
        "ending_0: {}".format(self.endings[0]),
        "ending_1: {}".format(self.endings[1]),
        "ending_2: {}".format(self.endings[2]),
        "ending_3: {}".format(self.endings[3]),
    ]
    if self.label is not None:
      fields.append("label: {}".format(self.label))
    return ", ".join(fields)
class RaceProcessor(object):
  """Processor for the RACE data set."""

  def __init__(self, use_spm, do_lower_case, high_only, middle_only):
    """Constructs a RaceProcessor.

    Args:
      use_spm: bool. Whether a SentencePiece model is used (selects the text
        preprocessing path in `process_text`).
      do_lower_case: bool. Whether to lowercase text during preprocessing.
      high_only: bool. If True, read only the "high" (high school) subset.
      middle_only: bool. If True, read only the "middle" (middle school)
        subset.
    """
    super(RaceProcessor, self).__init__()
    self.use_spm = use_spm
    self.do_lower_case = do_lower_case
    self.high_only = high_only
    self.middle_only = middle_only

  def get_train_examples(self, data_dir):
    """Gets a collection of `InputExample`s for the train set."""
    return self.read_examples(
        os.path.join(data_dir, "RACE", "train"))

  def get_dev_examples(self, data_dir):
    """Gets a collection of `InputExample`s for the dev set."""
    return self.read_examples(
        os.path.join(data_dir, "RACE", "dev"))

  def get_test_examples(self, data_dir):
    """Gets a collection of `InputExample`s for prediction."""
    return self.read_examples(
        os.path.join(data_dir, "RACE", "test"))

  def get_labels(self):
    """Gets the list of labels for this data set."""
    return ["A", "B", "C", "D"]

  def process_text(self, text):
    """Normalizes raw text according to the tokenizer in use."""
    if self.use_spm:
      return tokenization.preprocess_text(text, lower=self.do_lower_case)
    else:
      return tokenization.convert_to_unicode(text)

  def read_examples(self, data_dir):
    """Read examples from RACE json-lines files under `data_dir`.

    Expects `<data_dir>/<level>/all.txt` for each requested level, where
    every line is a JSON object with "id", "article", "questions",
    "options" (4 per question) and "answers" (letters "A".."D").

    Args:
      data_dir: Directory containing the "middle" and/or "high" subdirs.

    Returns:
      A list of `InputExample`s, one per (article, question) pair.
    """
    examples = []
    for level in ["middle", "high"]:
      # Honor the subset restrictions requested at construction time.
      if level == "middle" and self.high_only: continue
      if level == "high" and self.middle_only: continue
      cur_dir = os.path.join(data_dir, level)
      cur_path = os.path.join(cur_dir, "all.txt")
      with tf.gfile.Open(cur_path) as f:
        for line in f:
          cur_data = json.loads(line.strip())
          answers = cur_data["answers"]
          options = cur_data["options"]
          questions = cur_data["questions"]
          context = self.process_text(cur_data["article"])
          for i in range(len(answers)):
            # Map letter answers ("A".."D") to integer labels (0..3).
            label = ord(answers[i]) - ord("A")
            qa_list = []
            question = self.process_text(questions[i])
            for j in range(4):
              option = self.process_text(options[i][j])
              if "_" in question:
                # Cloze-style question: substitute the option into the blank.
                qa_cat = question.replace("_", option)
              else:
                qa_cat = " ".join([question, option])
              qa_list.append(qa_cat)
            examples.append(
                InputExample(
                    example_id=cur_data["id"],
                    context_sentence=context,
                    start_ending=None,
                    endings=[qa_list[0], qa_list[1], qa_list[2], qa_list[3]],
                    label=label
                )
            )
    return examples
def convert_single_example(example_index, example, label_size, max_seq_length,
                           tokenizer, max_qa_length):
  """Converts a single RACE `InputExample` into an `InputFeatures`.

  RACE is a multiple choice task. To perform this task using AlBERT,
  we will use the formatting proposed in "Improving Language
  Understanding by Generative Pre-Training" and suggested by
  @jacobdevlin-google in this issue
  https://github.com/google-research/bert/issues/38.

  Each choice will correspond to a sample on which we run the
  inference. For a given RACE example, we will create the 4
  following inputs:
  - [CLS] context [SEP] choice_1 [SEP]
  - [CLS] context [SEP] choice_2 [SEP]
  - [CLS] context [SEP] choice_3 [SEP]
  - [CLS] context [SEP] choice_4 [SEP]
  The model will output a single value for each input. To get the
  final decision of the model, we will run a softmax over these 4
  outputs.

  Args:
    example_index: int. Index of the example; the first 5 are logged.
    example: An `InputExample` or `classifier_utils.PaddingInputExample`.
    label_size: int. Number of choices (4 for RACE); used to size padding.
    max_seq_length: int. Total sequence length after padding/truncation.
    tokenizer: Tokenizer with `tokenize` and `convert_tokens_to_ids`.
    max_qa_length: int. Maximum number of question+answer tokens kept
      (the ending is truncated from the left to this length).

  Returns:
    A `classifier_utils.InputFeatures` whose input_ids/input_mask/segment_ids
    are each a list of `label_size` sequences of length `max_seq_length`.
  """
  if isinstance(example, classifier_utils.PaddingInputExample):
    # All-zero features for TPU batch padding; marked not real so metrics
    # can mask it out.
    return classifier_utils.InputFeatures(
        example_id=0,
        input_ids=[[0] * max_seq_length] * label_size,
        input_mask=[[0] * max_seq_length] * label_size,
        segment_ids=[[0] * max_seq_length] * label_size,
        label_id=0,
        is_real_example=False)
  else:
    context_tokens = tokenizer.tokenize(example.context_sentence)
    if example.start_ending is not None:
      start_ending_tokens = tokenizer.tokenize(example.start_ending)

    all_input_tokens = []
    all_input_ids = []
    all_input_mask = []
    all_segment_ids = []
    for ending in example.endings:
      # We create a copy of the context tokens in order to be
      # able to shrink it according to ending_tokens
      context_tokens_choice = context_tokens[:]
      if example.start_ending is not None:
        ending_tokens = start_ending_tokens + tokenizer.tokenize(ending)
      else:
        ending_tokens = tokenizer.tokenize(ending)
      # Truncate `ending_tokens` (keeping its tail) and then the context so
      # that the total length fits. Account for [CLS], [SEP], [SEP] with
      # "- 3".
      ending_tokens = ending_tokens[- max_qa_length:]

      if len(context_tokens_choice) + len(ending_tokens) > max_seq_length - 3:
        context_tokens_choice = context_tokens_choice[: (
            max_seq_length - 3 - len(ending_tokens))]
      tokens = ["[CLS]"] + context_tokens_choice + (
          ["[SEP]"] + ending_tokens + ["[SEP]"])
      # Segment 0 covers [CLS] + context + first [SEP]; segment 1 covers the
      # ending + final [SEP].
      segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (
          len(ending_tokens) + 1)

      input_ids = tokenizer.convert_tokens_to_ids(tokens)
      input_mask = [1] * len(input_ids)

      # Zero-pad up to the sequence length.
      padding = [0] * (max_seq_length - len(input_ids))
      input_ids += padding
      input_mask += padding
      segment_ids += padding

      assert len(input_ids) == max_seq_length
      assert len(input_mask) == max_seq_length
      assert len(segment_ids) == max_seq_length

      all_input_tokens.append(tokens)
      all_input_ids.append(input_ids)
      all_input_mask.append(input_mask)
      all_segment_ids.append(segment_ids)

    label = example.label
    if example_index < 5:
      tf.logging.info("*** Example ***")
      tf.logging.info("id: {}".format(example.example_id))
      for choice_idx, (tokens, input_ids, input_mask, segment_ids) in \
           enumerate(zip(all_input_tokens, all_input_ids, all_input_mask, all_segment_ids)):
        tf.logging.info("choice: {}".format(choice_idx))
        tf.logging.info("tokens: {}".format(" ".join(tokens)))
        tf.logging.info(
            "input_ids: {}".format(" ".join(map(str, input_ids))))
        tf.logging.info(
            "input_mask: {}".format(" ".join(map(str, input_mask))))
        tf.logging.info(
            "segment_ids: {}".format(" ".join(map(str, segment_ids))))
        tf.logging.info("label: {}".format(label))

    return classifier_utils.InputFeatures(
        example_id=example.example_id,
        input_ids=all_input_ids,
        input_mask=all_input_mask,
        segment_ids=all_segment_ids,
        label_id=label
    )
def file_based_convert_examples_to_features(
    examples, label_list, max_seq_length, tokenizer,
    output_file, max_qa_length):
  """Convert a set of `InputExample`s to a TFRecord file.

  Each example's per-choice feature lists are flattened into single int64
  feature vectors of length `len(label_list) * max_seq_length`.

  Args:
    examples: list of `InputExample`s (may include padding examples).
    label_list: list of label strings; only its length is used.
    max_seq_length: int. Per-choice sequence length.
    tokenizer: Tokenizer passed through to `convert_single_example`.
    output_file: Path of the TFRecord file to write.
    max_qa_length: int. Max question+answer tokens per choice.
  """
  writer = tf.python_io.TFRecordWriter(output_file)
  for (ex_index, example) in enumerate(examples):
    if ex_index % 10000 == 0:
      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
    feature = convert_single_example(ex_index, example, len(label_list),
                                     max_seq_length, tokenizer, max_qa_length)

    def create_int_feature(values):
      f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
      return f

    features = collections.OrderedDict()
    # Flatten the per-choice lists-of-lists into one flat int list each.
    features["input_ids"] = create_int_feature(sum(feature.input_ids, []))
    features["input_mask"] = create_int_feature(sum(feature.input_mask, []))
    features["segment_ids"] = create_int_feature(sum(feature.segment_ids, []))
    features["label_ids"] = create_int_feature([feature.label_id])
    features["is_real_example"] = create_int_feature(
        [int(feature.is_real_example)])

    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
    writer.write(tf_example.SerializeToString())
  writer.close()
def create_model(albert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings, max_seq_length,
                 dropout_prob, hub_module):
  """Creates a multiple-choice classification model.

  The per-choice sequences arrive flattened in the batch dimension; they are
  reshaped to (batch * num_labels, max_seq_length), scored with a single
  shared output head, then reshaped back to (batch, num_labels) logits over
  which a softmax picks the answer.

  Args:
    albert_config: `AlbertConfig` for the encoder.
    is_training: bool. Enables dropout when True.
    input_ids: int Tensor of flattened token ids.
    input_mask: int Tensor of flattened attention masks.
    segment_ids: int Tensor of flattened segment ids.
    labels: int Tensor of shape (batch,) with the correct-choice index.
    num_labels: int. Number of choices per example (4 for RACE).
    use_one_hot_embeddings: bool. Passed to the ALBERT embedding lookup.
    max_seq_length: int. Per-choice sequence length.
    dropout_prob: float. Dropout probability applied to the pooled output.
    hub_module: Optional TF-Hub module path for the encoder.

  Returns:
    Tuple (loss, per_example_loss, probabilities, logits, predictions).
  """
  bsz_per_core = tf.shape(input_ids)[0]

  # Fold the choice dimension into the batch so every choice is encoded
  # independently by the shared ALBERT encoder.
  input_ids = tf.reshape(input_ids, [bsz_per_core * num_labels, max_seq_length])
  input_mask = tf.reshape(input_mask,
                          [bsz_per_core * num_labels, max_seq_length])
  token_type_ids = tf.reshape(segment_ids,
                              [bsz_per_core * num_labels, max_seq_length])

  (output_layer, _) = fine_tuning_utils.create_albert(
      albert_config=albert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=token_type_ids,
      use_one_hot_embeddings=use_one_hot_embeddings,
      use_einsum=True,
      hub_module=hub_module)

  hidden_size = output_layer.shape[-1].value

  # A single scalar score per choice: weights shaped [1, hidden].
  output_weights = tf.get_variable(
      "output_weights", [1, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [1],
      initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # I.e., 0.1 dropout
      output_layer = tf.nn.dropout(
          output_layer, keep_prob=1 - dropout_prob)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    # Unfold back to (batch, num_labels) and softmax across the choices.
    logits = tf.reshape(logits, [bsz_per_core, num_labels])
    probabilities = tf.nn.softmax(logits, axis=-1)
    predictions = tf.argmax(probabilities, axis=-1, output_type=tf.int32)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    one_hot_labels = tf.one_hot(
        labels, depth=tf.cast(num_labels, dtype=tf.int32), dtype=tf.float32)

    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)

  return (loss, per_example_loss, probabilities, logits, predictions)
def model_fn_builder(albert_config, num_labels, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, use_tpu,
                     use_one_hot_embeddings, max_seq_length, dropout_prob,
                     hub_module):
  """Returns `model_fn` closure for TPUEstimator.

  Args:
    albert_config: `AlbertConfig` for the encoder.
    num_labels: int. Number of choices per example.
    init_checkpoint: Optional checkpoint path to warm-start from.
    learning_rate: float. Peak learning rate for the optimizer.
    num_train_steps: int. Total training steps.
    num_warmup_steps: int. Linear warmup steps.
    use_tpu: bool. Whether training runs on TPU (affects checkpoint init
      scaffolding and the optimizer).
    use_one_hot_embeddings: bool. Passed through to `create_model`.
    max_seq_length: int. Per-choice sequence length.
    dropout_prob: float. Dropout on the pooled output during training.
    hub_module: Optional TF-Hub module path for the encoder.

  Returns:
    A `model_fn` suitable for `contrib_tpu.TPUEstimator`.
  """

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""

    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
      tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]
    is_real_example = None
    if "is_real_example" in features:
      is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
    else:
      # Older records lack the flag; treat every example as real.
      is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    (total_loss, per_example_loss, probabilities, logits, predictions) = \
        create_model(albert_config, is_training, input_ids, input_mask,
                     segment_ids, label_ids, num_labels,
                     use_one_hot_embeddings, max_seq_length, dropout_prob,
                     hub_module)

    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
      (assignment_map, initialized_variable_names
      ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
      if use_tpu:
        # On TPU, checkpoint init must happen inside the scaffold so it runs
        # on each host.
        def tpu_scaffold():
          tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
          return tf.train.Scaffold()

        scaffold_fn = tpu_scaffold
      else:
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
      init_string = ""
      if var.name in initialized_variable_names:
        init_string = ", *INIT_FROM_CKPT*"
      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                      init_string)

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:
      train_op = optimization.create_optimizer(
          total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)

      output_spec = contrib_tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          train_op=train_op,
          scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.EVAL:
      def metric_fn(per_example_loss, label_ids, logits, is_real_example):
        # Weight by is_real_example so TPU padding examples don't count.
        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
        accuracy = tf.metrics.accuracy(
            labels=label_ids, predictions=predictions,
            weights=is_real_example)
        loss = tf.metrics.mean(
            values=per_example_loss, weights=is_real_example)
        return {
            "eval_accuracy": accuracy,
            "eval_loss": loss,
        }

      eval_metrics = (metric_fn,
                      [per_example_loss, label_ids, logits, is_real_example])
      output_spec = contrib_tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metrics=eval_metrics,
          scaffold_fn=scaffold_fn)
    else:
      # PREDICT mode: emit per-choice probabilities and the argmax choice.
      output_spec = contrib_tpu.TPUEstimatorSpec(
          mode=mode,
          predictions={"probabilities": probabilities,
                       "predictions": predictions},
          scaffold_fn=scaffold_fn)
    return output_spec

  return model_fn