# Copyright 2018 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Preprocesses pretrained word embeddings, creates dev sets for tasks without
a provided one, and determines the set of output classes for each task.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import random

from base import configure
from base import embeddings
from base import utils
from task_specific.word_level import word_level_data


def main(data_dir='./data'):
  # Fix the seed so the random dev split created below is reproducible.
  random.seed(0)

  utils.log("BUILDING WORD VOCABULARY/EMBEDDINGS")
  # Build the word vocabulary and embedding matrix from the pretrained
  # GloVe vectors.
  for pretrained in ['glove.6B.300d.txt']:
    config = configure.Config(data_dir=data_dir,
                              for_preprocessing=True,
                              pretrained_embeddings=pretrained,
                              word_embedding_size=300)
    embeddings.PretrainedEmbeddingLoader(config).build()

  utils.log("CONSTRUCTING DEV SETS")
  for task_name in ["chunk"]:
    # Chunking does not come with a provided dev split, so create one by
    # selecting a random subset of the training data: the first 1500
    # shuffled sentences become the dev set, the rest the training subset.
    config = configure.Config(data_dir=data_dir, for_preprocessing=True)
    task_data_dir = os.path.join(config.raw_data_topdir, task_name) + '/'
    train_sentences = word_level_data.TaggedDataLoader(
        config, task_name, False).get_labeled_sentences("train")
    random.shuffle(train_sentences)
    write_sentences(task_data_dir + 'train_subset.txt', train_sentences[1500:])
    write_sentences(task_data_dir + 'dev.txt', train_sentences[:1500])

  utils.log("WRITING LABEL MAPPINGS")
  for task_name in ["chunk"]:
    for i, label_encoding in enumerate(["BIOES"]):
      config = configure.Config(data_dir=data_dir,
                                for_preprocessing=True,
                                label_encoding=label_encoding)
      token_level = task_name in ["ccg", "pos", "depparse"]
      loader = word_level_data.TaggedDataLoader(config, task_name, token_level)
      if token_level:
        # Token-level tasks have one label per token, so their label mapping
        # does not depend on the span encoding; write it only once.
        if i != 0:
          continue
        utils.log("WRITING LABEL MAPPING FOR", task_name.upper())
      else:
        utils.log("  Writing label mapping for", task_name.upper(),
                  label_encoding)
      utils.log("  ", len(loader.label_mapping), "classes")
      utils.write_cpickle(loader.label_mapping, loader.label_mapping_path)


def write_sentences(fname, sentences):
  """Writes tagged sentences in CoNLL-style format: one "word tag" pair per
  line, with a blank line between sentences."""
  with open(fname, 'w') as f:
    for words, tags in sentences:
      for word, tag in zip(words, tags):
        f.write(word + " " + tag + "\n")
      f.write("\n")


if __name__ == '__main__':
  main()
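
# A minimal usage sketch (assumptions, not part of the original file: the
# script's filename and the on-disk layout are illustrative; it must be run
# from the directory that makes the `base` and `task_specific` packages
# importable, with glove.6B.300d.txt downloaded into the raw-data directory
# that configure.Config expects):
#
#   $ python preprocess.py   # hypothetical filename
#
# The dev.txt / train_subset.txt files written for chunking use the
# CoNLL-style format produced by write_sentences above, e.g.:
#
#   He B-NP
#   reckons B-VP
#   the B-NP
#   current I-NP
#   account I-NP
#   deficit I-NP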