import csv import json import os from .examples import MultipleChoiceExample, TextExample, TokensExample class DataProcessor: """Base class for data converters for sequence classification data sets.""" def __init__(self, data_dir): self.data_dir = data_dir def get_examples(self, lang, mode): if mode == 'train': return self.get_train_examples(lang) elif mode == 'dev': return self.get_dev_examples(lang) elif mode == 'test': return self.get_test_examples(lang) def modes(self): return ['train', 'dev', 'test'] def get_train_examples(self, lang): """Gets a collection of :class:`InputExample` for the train set.""" raise NotImplementedError() def get_dev_examples(self, lang): """Gets a collection of :class:`InputExample` for the dev set.""" raise NotImplementedError() def get_test_examples(self, lang): """Gets a collection of :class:`InputExample` for the test set.""" raise NotImplementedError() def get_labels(self, lang): """Gets the list of labels for this data set.""" raise NotImplementedError() @classmethod def read_csv(cls, input_file, quotechar=None): """Reads a tab separated value file.""" with open(input_file, encoding='utf-8') as fp: return list(csv.reader(fp, delimiter=',')) @classmethod def read_json(cls, input_file): """Reads a json file file.""" with open(input_file, encoding='utf-8') as fp: return json.load(fp) @classmethod def readlines(cls, filepath): with open(filepath, encoding='utf-8') as fp: return fp.readlines() @classmethod def read_jsonl(cls, filepath): with open(filepath, 'r', encoding='utf-8') as fp: data = fp.readlines() data = list(map(lambda l: json.loads(l), data)) return data class IndicNLPHeadlines(DataProcessor): """Processor for the Headline Predction dataset""" def __init__(self, data_dir): self.data_dir = data_dir def get_train_examples(self, lang): """See base class.""" fname = '{}/{}-train.json'.format(lang, lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.read_json(fpath), 'train') def get_dev_examples(self, lang): '''See base class.''' fname = '{}/{}-valid.json'.format(lang, lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.read_json(fpath), 'dev') def get_test_examples(self, lang): '''See base class.''' fname = '{}/{}-test.json'.format(lang, lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.read_json(fpath), 'test') def get_labels(self, lang): """See base class.""" return ['A', 'B', 'C', 'D'] def _create_examples(self, items, set_type): """Creates examples for the training and dev sets.""" examples = [ MultipleChoiceExample( example_id=idx, question='', contexts=[item['content'], item['content'], item['content'], item['content']], endings=[item['optionA'], item['optionB'], item['optionC'], item['optionD']], label=item['correctOption'], ) for idx, item in enumerate(items) ] return examples class WikiCloze(DataProcessor): """Processor for Wiki Cloze QA dataset""" def __init__(self, data_dir): self.data_dir = data_dir def modes(self): return ['test'] def get_test_examples(self, lang): """See base class.""" fname = '{}.json'.format(lang, lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.read_json(fpath)['cloze_data'], 'test') def get_labels(self, lang): """See base class.""" return list(range(4)) def _create_examples(self, items, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, item) in enumerate(items): if '' in [option.strip() for option in item['options']]: continue example = MultipleChoiceExample( example_id=i, question=item['question'].replace('', '[MASK]'), contexts=[], endings=item['options'], label=item['options'].index(item['answer']) ) examples.append(example) return examples class IndicNLPGenre(DataProcessor): """Processor for the Article Genre Classification data set""" def __init__(self, data_dir): self.data_dir = data_dir def get_train_examples(self, lang): """See base class.""" fname = '{}/{}-train.csv'.format(lang, lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.read_csv(fpath), 'train') def get_dev_examples(self, lang): """See base class.""" fname = '{}/{}-valid.csv'.format(lang, lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.read_csv(fpath), 'dev') def get_test_examples(self, lang): fname = '{}/{}-test.csv'.format(lang, lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.read_csv(fpath), 'test') def get_labels(self, lang): """See base class.""" filename = '{}/{}-train.csv'.format(lang, lang) lines = self.read_csv(os.path.join(self.data_dir, filename)) labels = map(lambda l: l[0], lines) labels = list(set(labels)) return labels def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): example = TextExample( guid=('%s-%s' % (set_type, i)), text_a=line[1], label=line[0] ) examples.append(example) return examples class WikiNER(DataProcessor): def __init__(self, data_dir): self.data_dir = data_dir def get_examples(self, lang, mode): mode = 'valid' if mode == 'dev' else mode file_path = os.path.join(self.data_dir, lang, f'{mode}.txt') guid_index = 1 examples = [] with open(file_path, encoding='utf-8') as f: words = [] labels = [] for line in f: if line.startswith('-DOCSTART-') or line == '' or line == '\n': if words: example = TokensExample( guid=f'{mode}-{guid_index}', words=words, labels=labels ) examples.append(example) guid_index += 1 words = [] labels = [] else: splits = line.split(' ') words.append(splits[0]) if len(splits) > 1: labels.append(splits[-1].replace('\n', '')) else: # Examples could have no label for mode = 'test' labels.append('O') if words: example = TokensExample( guid=f'{mode}-{guid_index}', words=words, labels=labels ) examples.append(example) return examples def get_labels(self, lang): path = os.path.join(self.data_dir, lang, 'labels.txt') with open(path, 'r') as f: labels = f.read().splitlines() if 'O' not in labels: labels = ['O'] + labels return labels class WikiSectionTitles(DataProcessor): """Processor for the Wikipedia Section Title Prediction dataset""" def __init__(self, data_dir): self.data_dir = data_dir def get_train_examples(self, lang): """See base class.""" fname = '{}/{}-train.json'.format(lang, lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.read_json(fpath), 'train') def get_dev_examples(self, lang): """See base class.""" fname = '{}/{}-valid.json'.format(lang, lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.read_json(fpath), 'dev') def get_test_examples(self, lang): """See base class.""" fname = '{}/{}-test.json'.format(lang, lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.read_json(fpath), 'test') def get_labels(self, lang): """See base class.""" return ['titleA', 'titleB', 'titleC', 'titleD'] def _create_examples(self, items, set_type): """Creates examples for the training and dev sets.""" examples = [ MultipleChoiceExample( example_id=idx, question='', contexts=[item['sectionText'], item['sectionText'], item['sectionText'], item['sectionText']], endings=[item['titleA'], item['titleB'], item['titleC'], item['titleD']], label=item['correctTitle'], ) for idx, item in enumerate(items) ] return examples class ManKiBaat(DataProcessor): """Processor for Man ki Baat dataset""" def __init__(self, data_dir): self.data_dir = data_dir def modes(self): return ['en', 'in'] def get_examples(self, lang, mode): if mode == 'en': return self.get_examples_en(lang) elif mode == 'in': return self.get_examples_in(lang) def get_examples_en(self, lang): """Get examples of English language""" fname = 'en-{}/mkb.en'.format(lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.readlines(fpath), 'en') def get_examples_in(self, lang): """Get examples of the Indian language""" fname = 'en-{}/mkb.{}'.format(lang, lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.readlines(fpath), 'in') def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): example = TextExample( guid=('%s-%s' % (set_type, i)), text_a=line, label=i ) examples.append(example) return examples def get_labels(self, lang): # return dummy value greater than number of examples return list(range(10000)) class ACTSA(IndicNLPGenre): pass class BBCNews(IndicNLPGenre): def get_dev_examples(self, lang): """See base class.""" fname = '{}/{}-test.csv'.format(lang, lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.read_csv(fpath), 'dev') class INLTKHeadlines(IndicNLPGenre): pass class SohamArticles(IndicNLPGenre): pass class IITPMovies(IndicNLPGenre): pass class IITProducts(IndicNLPGenre): pass class AmritaParaphraseExact(IndicNLPGenre): def get_dev_examples(self, lang): """See base class.""" fname = '{}/{}-test.csv'.format(lang, lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.read_csv(fpath), 'dev') def get_labels(self, lang): """See base class.""" filename = '{}/{}-train.csv'.format(lang, lang) lines = self.read_csv(os.path.join(self.data_dir, filename)) labels = map(lambda l: l[2], lines) labels = list(set(labels)) return labels def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): example = TextExample( guid=('%s-%s' % (set_type, i)), text_a=line[0], text_b=line[1], label=line[2] ) examples.append(example) return examples class AmritaParaphraseFuzzy(AmritaParaphraseExact): pass class MidasDiscourse(DataProcessor): """Processor for the Article Genre Classification data set""" def __init__(self, data_dir): self.data_dir = data_dir def get_train_examples(self, lang): """See base class.""" fname = '{}/train.json'.format(lang, lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.read_json(fpath), 'train') def get_dev_examples(self, lang): """See base class.""" fname = '{}/val.json'.format(lang, lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.read_json(fpath), 'dev') def get_test_examples(self, lang): fname = '{}/test.json'.format(lang, lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.read_json(fpath), 'test') def get_labels(self, lang): """See base class.""" filename = '{}/train.json'.format(lang, lang) lines = self.read_json(os.path.join(self.data_dir, filename)) labels = map(lambda l: l['Discourse Mode'], lines) labels = list(set(labels)) return labels def _create_examples(self, lines, set_type): """Creates examples for the training and dev sets.""" examples = [] for (i, line) in enumerate(lines): example = TextExample( guid=('%s-%s' % (set_type, i)), text_a=line['Sentence'], label=line['Discourse Mode'] ) examples.append(example) return examples class WNLI(DataProcessor): """Processor for the WNLI data set (GLUE version).""" def __init__(self, data_dir): self.data_dir = data_dir def get_train_examples(self, lang): """See base class.""" fname = '{}/train.csv'.format(lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.read_csv(fpath), 'train') def get_dev_examples(self, lang): """See base class.""" fname = '{}/dev.csv'.format(lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.read_csv(fpath), 'dev') def get_test_examples(self, lang): """See base class.""" fname = '{}/dev.csv'.format(lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.read_csv(fpath), 'test') def get_labels(self, lang): """See base class.""" return ['0', '1'] def _create_examples(self, lines, set_type): """Creates examples for the training, dev and test sets.""" examples = [] for (i, line) in enumerate(lines): if i == 0: continue guid = "%s-%s" % (set_type, line[0]) text_a = line[1] text_b = line[2] label = line[-1] examples.append(TextExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) return examples class COPA(DataProcessor): """Processor for the Wikipedia Section Title Prediction dataset""" def __init__(self, data_dir): self.data_dir = data_dir def get_train_examples(self, lang): """See base class.""" fname = '{}/train.jsonl'.format(lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.read_jsonl(fpath), 'train') def get_dev_examples(self, lang): """See base class.""" fname = '{}/val.jsonl'.format(lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.read_jsonl(fpath), 'dev') def get_test_examples(self, lang): """See base class.""" fname = '{}/val.jsonl'.format(lang, lang) fpath = os.path.join(self.data_dir, fname) return self._create_examples(self.read_jsonl(fpath), 'test') def get_labels(self, lang): """See base class.""" return [0, 1] def _create_examples(self, items, set_type): """Creates examples for the training and dev sets.""" examples = [ MultipleChoiceExample( example_id=idx, question='', contexts=[item['premise'], item['premise']], endings=[item['choice1'], item['choice2']], label=item['label'], ) for idx, item in enumerate(items) ] return examples