"""Export FrameNet 1.7 splits as JSON-lines files.

Two directories are created under the destination: ``full`` (fully annotated
training data only) and ``full_exe`` (training data extended with exemplars).
"""
import json
import os
from argparse import ArgumentParser

from tools.framenet.retokenize_fn import load_nltk_exemplars, load_nltk_fully_annotated


def _read_json(path):
    """Return the parsed JSON document stored at ``path``."""
    # Context manager so the handle is closed deterministically.
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


def _write_jsonl(path, records):
    """Write ``records`` to ``path``, one JSON document per line.

    Lines are joined with a newline; no trailing newline is written.
    """
    with open(path, 'w', encoding='utf-8') as handle:
        handle.write('\n'.join(map(json.dumps, records)))


def main(src_path, dst_path):
    """Dump the train/dev/test splits into ``dst_path``.

    Args:
        src_path: Directory containing ``full.17.json`` and ``exe.17.json``.
            When ``None``, both are obtained through
            ``load_nltk_fully_annotated('1.7')`` and
            ``load_nltk_exemplars('1.7')`` instead.
        dst_path: Directory under which the ``full`` and ``full_exe``
            sub-directories are written (created if missing).
    """
    if src_path is not None:
        full = _read_json(os.path.join(src_path, 'full.17.json'))
        exe = _read_json(os.path.join(src_path, 'exe.17.json'))
    else:
        full = load_nltk_fully_annotated('1.7')
        exe = load_nltk_exemplars('1.7')
    train, dev, test = full['train'], full['dev'], full['test']

    def dump(train_set, path):
        """Write train/dev/test plus their concatenation into ``path``."""
        os.makedirs(path, exist_ok=True)
        # dev and test are shared by both variants; only the training set varies.
        for split, data_set in zip(['train', 'dev', 'test'], [train_set, dev, test]):
            _write_jsonl(os.path.join(path, split + '.jsonl'), data_set)
        _write_jsonl(os.path.join(path, 'full.jsonl'), train_set + dev + test)

    # Full text only
    dump(train, os.path.join(dst_path, 'full'))
    # Full text + exemplar
    # NOTE(review): assumes `exe` is the same sequence type as `train` so that
    # `+` concatenates them -- confirm against the loaders.
    dump(train + exe, os.path.join(dst_path, 'full_exe'))


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument('dst', metavar='destination')
    parser.add_argument(
        '-s', metavar='data', default=None,
        help='Path to retokenized framenet. If not provided, will re-load.'
    )
    cmd_args = parser.parse_args()
    main(cmd_args.s, cmd_args.dst)