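"""Retokenize FrameNet annotations from NLTK into JSON.

Loads the full-text documents (split into train/dev/test via framenet_split)
and the exemplar sentences from the NLTK FrameNet corpus, converts the
frame and frame-element annotations from character offsets into token spans,
and dumps the result as JSON files.
"""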
import gzip
import json
import os
import logging
from argparse import ArgumentParser
from itertools import accumulate
import nltk
import numpy as np
from tools.framenet.nltk_framenet import framenet, framenet15
from tqdm import tqdm
from tools.framenet.fn_util import framenet_split, Sentence
logger = logging.getLogger('fn')
def _load_raw(version):
    if version == '1.5':
        nltk.download('framenet_v15')
        return framenet15
    else:
        nltk.download('framenet_v17')
        return framenet
def one_frame(sentence, ann):
    # Convert one NLTK annotation set into a frame dict with token-level spans.
    frame_info = {'label': ann.frame.name}
    target_list = list()
    for start, end in ann.Target:
        start, end = sentence.span(start, end)
        target_list.extend(list(range(start, end+1)))
    assert len(target_list) > 0
    frame_info['span'] = [sorted(target_list)[0], sorted(target_list)[-1]]
    frame_info['lu'] = ann.LU.name
    frame_info['children'] = fes = list()
    for start, end, fe_name in ann.FE[0]:
        start, end = sentence.span(start, end)
        fes.append({'span': [start, end], 'label': fe_name})
    return frame_info
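# Illustrative shape of the dict returned by one_frame (values are hypothetical):
#   {'label': 'Arriving', 'span': [3, 3], 'lu': 'arrive.v',
#    'children': [{'span': [0, 2], 'label': 'Theme'}, {'span': [4, 6], 'label': 'Goal'}]}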
def load_nltk_exemplars(version, exclude_ann_ids=None):
    # Use a set for fast membership checks against already-seen annotation IDs.
    exclude_ann_ids = set(exclude_ann_ids or [])
    fn = _load_raw(version)
    egs = list()
    bar = tqdm()
    skipped = 0
    try:
        for eg in fn.annotations(full_text=False):
            if 'Target' not in eg.keys():
                # A bug of nltk: some exemplar annotations come without a Target.
                continue
            if eg.ID in exclude_ann_ids:
                skipped += 1
                continue
            try:
                sentence = Sentence(eg.text)
                egs.append({
                    'tokens': list(map(str, sentence.tokens)),
                    'annotations': [one_frame(sentence, eg)],
                    'meta': {
                        'fully_annotated': False,
                        'source': f'framenet_v{version}',
                        'with_fe': True,
                        'type': 'exemplar',
                        'ann_ids': [eg.ID],
                    }
                })
                bar.update()
            except Exception:
                # Skip exemplars whose annotations cannot be aligned to the retokenized sentence.
                pass
    except Exception:
        # If iterating over the exemplar corpus fails partway, keep whatever was collected so far.
        pass
    bar.close()
    logger.info(f'Loaded {len(egs)} sentences for framenet v{version} from exemplars. (skipped {skipped} sentences)')
    return egs
def load_nltk_fully_annotated(version):
    fn = _load_raw(version)
    splits = list(framenet_split.keys())
    all_containers = {split: [] for split in splits}
    for doc in tqdm(fn.docs()):
        # Documents not listed in framenet_split default to the training split.
        container = all_containers['train']
        for sp in splits:
            if doc.filename in framenet_split[sp]:
                container = all_containers[sp]
        for sent in doc.sentence:
            sentence = Sentence(sent.text)
            all_frames = list()
            ann_ids = []
            for ann in sent.annotationSet:
                if ann._type == 'posannotationset':
                    continue
                assert ann._type == 'fulltext_annotationset'
                if 'Target' not in ann.keys():
                    logger.warning('Target not found.')
                    continue
                if 'ID' in ann:
                    ann_ids.append(ann['ID'])
                frame_info = one_frame(sentence, ann)
                all_frames.append(frame_info)
            eg_dict = {
                'tokens': list(map(str, sentence.tokens)),
                'annotations': all_frames,
                'meta': {
                    'source': f'framenet_v{version}',
                    'fully_annotated': True,
                    'with_fe': True,
                    'type': 'full text',
                    'sentence ID': sent.ID,
                    'doc': doc.filename,
                    'ann_ids': ann_ids
                }
            }
            container.append(eg_dict)
    for sp in splits:
        logger.info(f'Loaded {len(all_containers[sp])} sentences for {sp}.')
    return all_containers
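# load_nltk_fully_annotated returns one list of sentence dicts per split,
# e.g. {'train': [...], 'dev': [...], 'test': [...]} (keys come from framenet_split).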
def load_expanded_fn(path):
    raise NotImplementedError
    # The code below is disabled by the raise above and kept for reference.
    with gzip.open(path, 'rb') as compressed:
        lines = compressed.read().decode()
    instances = list()
    lines = lines.split('\n')
    for line in tqdm(lines):
        if len(line) != 0:
            instances.append(json.loads(line))
    logger.info(f'{len(instances)} lines loaded.')
    dataset = list()
    for instance in tqdm(instances, desc='Processing expanded framenet...'):
        for output in instance['outputs']:
            ins_dict = dict()
            ins_dict['meta'] = {
                'source': 'expanded framenet',
                'type': 'paraphrase',
                'exemplar_id': instance['exemplar_id'],
                'annoset_id': instance['annoset_id']
            }
            words = output['output_string']
            text = ' '.join(words)
            length_offsets = [0] + list(accumulate(map(len, words)))
            start_idx, end_idx = output['output_trigger_offset']
            start_idx = length_offsets[start_idx] + start_idx
            end_idx = length_offsets[end_idx] + end_idx - 2
            sentence = Sentence(text)
            ins_dict['text'] = sentence.tokens
            ins_dict['pos'] = sentence.pos
            ins_dict['tag'] = sentence.tag
            ins_dict['frame'] = [{
                'name': instance['frame_name'],
                'target': list(range(sentence.span(start_idx, end_idx)[0], sentence.span(start_idx, end_idx)[1]+1)),
                'lu': output['output_trigger'],
                'fe': []
            }]
            ins_dict['score'] = {
                'pbr': np.exp(-output['pbr_score']),
                'aligner': output['aligner_score'],
            }
            ins_dict['with_fe'] = False
            ins_dict['fully_annotated'] = False
            dataset.append(ins_dict)
    logger.info(f'{len(dataset)} sentences loaded.')
    return dataset
if __name__ == '__main__':
    logging.basicConfig(level='INFO')
    arg_parser = ArgumentParser()
    arg_parser.add_argument('output', type=str)
    arg_parser.add_argument('-v', type=str, default='1.7')
    cmd_args = arg_parser.parse_args()
    full = load_nltk_fully_annotated(cmd_args.v)
    # Collect annotation-set IDs from the full-text data so overlapping exemplars are skipped.
    full_ann_ids = list()
    for split in ['train', 'dev', 'test']:
        for sent in full[split]:
            full_ann_ids.extend(sent['meta']['ann_ids'])
    exe = load_nltk_exemplars(cmd_args.v, full_ann_ids)
    os.makedirs(cmd_args.output, exist_ok=True)
    with open(os.path.join(cmd_args.output, 'full.' + cmd_args.v.replace('.', '') + '.json'), 'w') as fp:
        json.dump(full, fp)
    with open(os.path.join(cmd_args.output, 'exe.' + cmd_args.v.replace('.', '') + '.json'), 'w') as fp:
        json.dump(exe, fp)
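# Example invocation (assuming the repository root is on PYTHONPATH; paths are illustrative):
#   python -m tools.framenet.retokenize_fn data/fn_retokenized -v 1.7
# which writes data/fn_retokenized/full.17.json and data/fn_retokenized/exe.17.json.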