import re
import json
from copy import deepcopy

from transformers import BartTokenizer
# Scratch notes: a sample RAMS record and template-filling experiments.
# dict = {"rel_triggers": [], "gold_rel_links": [], "doc_key": "nw_RC00c8620ef5810429342a1c339e6c76c1b0b9add3f6010f04482fd832", "ent_spans": [[27, 27, [["evt043arg01communicator", 1.0]]], [48, 48, [["evt043arg03place", 1.0]]], [32, 36, [["evt043arg02recipient", 1.0]]]], "language_id": "eng", "source_url": "http://bbc.co.uk/sport/athletics/36295481", "evt_triggers": [[31, 31, [["contact.prevarication.broadcast", 1.0]]]], "split": "test", "sentences": [["We", "are", "ashamed", "of", "them", ".", "\""], ["However", ",", "Mutko", "stopped", "short", "of", "admitting", "the", "doping", "scandal", "was", "state", "sponsored", "."], ["\"", "We", "are", "very", "sorry", "that", "athletes", "who", "tried", "to", "deceive", "us", ",", "and", "the", "world", ",", "were", "not", "caught", "sooner", "."], ["We", "are", "very", "sorry", "because", "Russia", "is", "committed", "to", "upholding", "the", "highest", "standards", "in", "sport", "and", "is", "opposed", "to", "anything", "that", "threatens", "the", "Olympic", "values", ",", "\"", "he", "said", "."], ["English", "former", "heptathlete", "and", "Athens", "2004", "bronze", "medallist", "Kelly", "Sotherton", "was", "unhappy", "with", "Mutko", "'s", "plea", "for", "Russia", "'s", "ban", "to", "be", "lifted", "for", "Rio"]], "gold_evt_links": [[[31, 31], [27, 27], "evt043arg01communicator"], [[31, 31], [32, 32], "evt043arg02recipient"], [[31, 31], [48, 48], "evt043arg03place"]], "clusters": [[[0, 0], [22, 22], [32, 32], [43, 43], [48, 48], [90, 91]], [[9, 9], [70, 70], [86, 87]]], "corefs": [[[0, 0], [22, 22], [32, 32], [43, 43], [48, 48], [90, 90]], [[9, 9], [70, 70], [86, 86]]]}
#
# template = "what is the <arg>"
#
# context_words = [w for sent in dict['sentences'] for w in sent]
#
# argtext = context_words[27]
#
# print(argtext)
#
# template = re.sub('<arg>', argtext, template)
#
# print(template)
# for lidx, triple in enumerate(dict['gold_evt_links']):
#     # trigger span, argument span, argument role
#     # example: "gold_evt_links":
#     # [[[40, 40], [33, 33], "evt089arg01victim"],
#     #  [[40, 40], [28, 28], "evt089arg02place"]]
#     trigger_span, argument_span, arg_name = triple
#     # which argument slot this role maps to ('arg1', 'arg2', ...)
#     arg_num = ontology_dict[evt_type.replace('n/a', 'unspecified')][arg_name]
#     # the argument phrase itself
#     arg_text = ' '.join(context_words[argument_span[0]:argument_span[1] + 1])
#     # replace each numbered placeholder in the template with the argument
#     # text, filling the <arg> slots in list order
#     template[lidx] = re.sub('<{}>'.format(arg_num), arg_text, template[lidx])
# "ent_spans": [[27, 27, [["evt043arg01communicator", 1.0]]],
#               [48, 48, [["evt043arg03place", 1.0]]],
#               [32, 36, [["evt043arg02recipient", 1.0]]]]
# context_words = [w for sent in dict['sentences'] for w in sent]
#
# print(context_words[32])
# print(context_words[33])
# print(context_words[34])
# print(context_words[35])
# print(context_words[36])
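
# The scratch notes above reduce to two regex passes over a role template:
# fill a specific numbered slot with its argument phrase, then mask the
# remaining numbered slots to a bare <arg>. A self-contained sketch (the
# template string here is made up for illustration):
#
#   template = '<arg1> was injured by <arg2> at <arg3>'
#   filled = re.sub('<arg1>', 'people', template)   # fill one linked slot
#   masked = re.sub(r'<arg\d>', '<arg>', filled)    # mask the unlinked slots
#   # masked == 'people was injured by <arg> at <arg>'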
def get_event_type(ex):
    # Collect every event type label attached to the example's event triggers.
    evt_type = []
    for evt in ex['evt_triggers']:
        for t in evt[2]:
            evt_type.append(t[0])
    return evt_type
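
# Usage sketch, on a toy record shaped like the sample commented above:
#   get_event_type({'evt_triggers': [[31, 31, [['contact.prevarication.broadcast', 1.0]]]]})
#   # -> ['contact.prevarication.broadcast']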
def create_gold_gen(ex, ontology_dict, mark_trigger=True):
    # Three parallel lists: the input templates, the gold output templates,
    # and the document context for each role template.
    INPUT = []
    OUTPUT = []
    CONTEXT = []
    evt_type = get_event_type(ex)[0]
    context_words = [w for sent in ex['sentences'] for w in sent]
    input_template = ontology_dict[evt_type.replace('n/a', 'unspecified')]['template']
    i = len(input_template)
    input_list = []
    for x in range(i):
        # Mask every numbered placeholder (<arg1>, <arg2>, ...) to a bare <arg>.
        masked = re.sub(r'<arg\d>', '<arg>', input_template[x])
        input_list.append(masked)
    # input_list now holds the templates with every <argN> masked to <arg>;
    # next, split each one on spaces.
    for x in range(i):
        INPUT.append(input_list[x].split(' '))
    # Subword-tokenize each space-tokenized template with the BART tokenizer.
    # (Loading the tokenizer on every call is slow; hoisting it to module
    # level would be cheaper.)
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
    temp = []
    for x in range(len(INPUT)):
        tokenized_input_template = []
        for w in INPUT[x]:
            tokenized_input_template.extend(tokenizer.tokenize(w, add_prefix_space=True))
        temp.append(tokenized_input_template)
    # temp (the subword-tokenized templates) is not used further here; the
    # symmetric output tokenization is likewise left disabled below.
    # Work on a copy of the role templates: the loop below fills them in
    # place, and mutating the shared ontology_dict entry would corrupt it
    # for later examples.
    template = deepcopy(ontology_dict[evt_type.replace('n/a', 'unspecified')]['template'])
    # e.g. ['<arg1> was injured by injurer with medical issue at place',
    #       'Victim was injured by <arg2> with medical issue at place',
    #       'Victim was injured by injurer with medical issue at <arg3> place',
    #       'Victim was injured by injurer with <arg4> medical issue at place']
    for lidx, triple in enumerate(ex['gold_evt_links']):
        # Each link is (trigger span, argument span, argument role), e.g.
        # "gold_evt_links": [[[50, 50], [48, 48], 'evt092arg01victim'],
        #                    [[50, 50], [7, 7], 'evt092arg03place']]
        trigger_span, argument_span, arg_name = triple
        # Which argument slot this role maps to ('arg1', 'arg2', ...).
        arg_num = ontology_dict[evt_type.replace('n/a', 'unspecified')][arg_name]
        # The argument phrase itself.
        arg_text = ' '.join(context_words[argument_span[0]:argument_span[1] + 1])
        # Find the role template that still contains this slot's placeholder,
        # then substitute the argument text for it.
        for index in range(len(template)):
            if '<{}>'.format(arg_num) in template[index]:
                break
        template[index] = re.sub('<{}>'.format(arg_num), arg_text, template[index])
    # Trigger marking (mark_trigger) is not applied in this version.
    trigger = ex['evt_triggers'][0]
    # One copy of the flattened document context per role template.
    for w in range(i):
        CONTEXT.append(context_words)
    output_template = []
    # template now has every linked slot filled with its argument phrase;
    # mask any placeholders that received no argument back to a bare <arg>.
    for j in range(len(template)):
        output_template.append(re.sub(r'<arg\d>', '<arg>', template[j]))
    for j in range(len(output_template)):
        OUTPUT.append(output_template[j].split(' '))
    # Subword tokenization of the outputs is currently disabled:
    # tokenized_out_template = []
    # for j in range(len(OUTPUT)):
    #     for w in OUTPUT[j]:
    #         tokenized_out_template.extend(tokenizer.tokenize(w, add_prefix_space=True))
    return INPUT, OUTPUT, CONTEXT
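
# Usage sketch for create_gold_gen (assumes the ontology CSV and the RAMS
# data files used by prepare_data below are available):
#   ontology = load_ontology()
#   with open('./data/RAMS_1.0/data/dev.jsonlines') as f:
#       ex = json.loads(f.readline())
#   inp, out, ctx = create_gold_gen(ex, ontology)
#   # inp, out and ctx are parallel lists, one entry per role template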
def load_ontology():
    ontology_dict = {}
    with open('aida_ontology_fj-5.csv', 'r') as f:
        for lidx, line in enumerate(f):
            if lidx == 0:  # header
                continue
            fields = line.strip().split(',')
            if len(fields) < 2:
                break
            evt_type = fields[0]
            if evt_type in ontology_dict.keys():
                # All argument roles for this event type.
                args = fields[2:]
                # Append this row's role template to the event type's template list.
                ontology_dict[evt_type]['template'].append(fields[1])
                for i, arg in enumerate(args):
                    if arg != '':
                        # Record the two-way mapping between slot name and role name.
                        ontology_dict[evt_type]['arg{}'.format(i + 1)] = arg
                        ontology_dict[evt_type][arg] = 'arg{}'.format(i + 1)
            # First time this event type is seen: create its entry.
            else:
                ontology_dict[evt_type] = {}
                args = fields[2:]
                ontology_dict[evt_type]['template'] = []
                ontology_dict[evt_type]['template'].append(fields[1])
                for i, arg in enumerate(args):
                    if arg != '':
                        ontology_dict[evt_type]['arg{}'.format(i + 1)] = arg
                        ontology_dict[evt_type][arg] = 'arg{}'.format(i + 1)
    return ontology_dict
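
# Expected 'aida_ontology_fj-5.csv' layout, inferred from the parsing above:
# column 0 = event type, column 1 = one role template, columns 2+ = role names
# aligned with <arg1>, <arg2>, ... A hypothetical row:
#   life.injure.unspecified,<arg1> was injured by injurer with medical issue at place,evt092arg01victim
# would yield ontology_dict['life.injure.unspecified'] holding a 'template'
# list plus two-way slot/role mappings such as
#   {'arg1': 'evt092arg01victim', 'evt092arg01victim': 'arg1', ...}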
def prepare_data():
    ontology_dict = load_ontology()
    for split, f in [('val', './data/RAMS_1.0/data/dev.jsonlines'),
                     ('train', './data/RAMS_1.0/data/train.jsonlines'),
                     ('test', './data/RAMS_1.0/data/test_head_coref.jsonlines')]:
        # , open('head_templates_preprocessed_data/{}.jsonl'.format(split), 'w') as writer
        with open(f, 'r') as reader:
            for lidx, line in enumerate(reader):
                ex = json.loads(line.strip())
                event_type = get_event_type(ex)[0]
                # debug: inspect one specific document; a sample record and
                # its ontology entry are reproduced below for reference.
                # if ex['doc_key'] == 'nw_RC013c8e78b7e8a4fb22193483877058f712dfd8b75b7a06d950de0b8f':
                #     print(ex)
                #     print(ontology_dict[event_type.replace('n/a', 'unspecified')])
                # {'rel_triggers': [], 'gold_rel_links': [],
                #  'doc_key': 'nw_RC04992035300b2ec94d8692646a28dc8b5f210c94842d20834c5342df',
                #  'ent_spans': [[48, 48, [['evt092arg01victim', 1.0]]], [7, 7, [['evt092arg03place', 1.0]]]],
                #  'language_id': 'eng',
                #  'source_url': 'http://news.sky.com/story/attack-in-nice-truck-ploughes-into-crowd-10502068',
                #  'evt_triggers': [[50, 50, [['life.injure.n/a', 1.0]]]], 'split': 'test',
                #  'sentences': [[':', ':', 'History', 'Of', 'Deadly', 'Attacks', 'In', 'France'],
                #                ['One', ',', 'Laurence', 'Olding', ',', 'was', 'with', 'his', 'fiance', 'and',
                #                 'jumped', 'over', 'a', 'promenade', 'wall', 'onto', 'the', 'concrete', 'below',
                #                 'to', 'avoid', 'the', 'truck', '.'],
                #                ['Emerging', 'from', 'hospital', 'in', 'bandages', 'he', 'said', ':', '"', 'There',
                #                 'was', 'debris', 'in', 'the', 'streets', ',', 'people', 'lying', 'injured', 'or',
                #                 'dead', 'in', 'the', 'road', '.', '"'],
                #                ['Video', ':', 'Hollande', 'On', "'", 'Horror', "'", 'Of', 'Attack'],
                #                ['Two', 'Americans', '-', 'Sean', 'Copeland', 'and', 'his', '11-year', '-', 'old',
                #                 'son', 'Brodie', 'from', 'Texas', '-', 'have', 'been', 'confirmed', 'among', 'the',
                #                 'dead', ',', 'a', 'US', 'official', 'said', '.']],
                #  'gold_evt_links': [[[50, 50], [48, 48], 'evt092arg01victim'],
                #                     [[50, 50], [7, 7], 'evt092arg03place']],
                #  'clusters': [[[10, 11], [15, 15], [37, 37]], [[70, 71], [73, 73]]],
                #  'corefs': [[[11, 11], [15, 15], [37, 37]], [[71, 71], [73, 73]]]}
                # Its ontology entry:
                # {'template': ['<arg1> was injured by injurer with medical issue at place',
                #               'Victim was injured by <arg2> with medical issue at place',
                #               'Victim was injured by injurer with medical issue at <arg3> place',
                #               'Victim was injured by injurer with <arg4> medical issue at place'],
                #  'arg1': 'evt092arg01victim', 'evt092arg01victim': 'arg1', 'arg2': 'evt092arg02injurer',
                #  'evt092arg02injurer': 'arg2', 'arg3': 'evt092arg03place', 'evt092arg03place': 'arg3',
                #  'arg4': 'evt092arg04medicalissue', 'evt092arg04medicalissue': 'arg4'}
                # create_gold_gen works on a deep copy of the role templates,
                # so ontology_dict stays intact across examples.
                input_template, output_template, context = create_gold_gen(ex, ontology_dict, True)
                # debug: expected templates for the document above.
                # if ex['doc_key'] == 'nw_RC013c8e78b7e8a4fb22193483877058f712dfd8b75b7a06d950de0b8f':
                #     print(input_template)
                #     print(output_template)
                # input_template:
                # [['<arg>', 'was', 'injured', 'by', 'injurer', 'with', 'medical', 'issue', 'at', 'place'],
                #  ['Victim', 'was', 'injured', 'by', '<arg>', 'with', 'medical', 'issue', 'at', 'place'],
                #  ['Victim', 'was', 'injured', 'by', 'injurer', 'with', 'medical', 'issue', 'at', '<arg>', 'place'],
                #  ['Victim', 'was', 'injured', 'by', 'injurer', 'with', '<arg>', 'medical', 'issue', 'at', 'place']]
                # output_template:
                # [['people', 'was', 'injured', 'by', 'injurer', 'with', 'medical', 'issue', 'at', 'place'],
                #  ['Victim', 'was', 'injured', 'by', '<arg>', 'with', 'medical', 'issue', 'at', 'place'],
                #  ['Victim', 'was', 'injured', 'by', 'injurer', 'with', 'medical', 'issue', 'at', '<arg>', 'place'],
                #  ['Victim', 'was', 'injured', 'by', 'injurer', 'with', '<arg>', 'medical', 'issue', 'at', 'place']]
                # input_template, output_template and context each have one
                # entry per role template (4 for this event type).
                # For reference, ex['sentences'] for the document above:
                # [[':', ':', 'History', 'Of', 'Deadly', 'Attacks', 'In', 'France'],
                #  ['One', ',', 'Laurence', 'Olding', ',', 'was', 'with', 'his', 'fiance', 'and', 'jumped', 'over', 'a', 'promenade', 'wall', 'onto', 'the', 'concrete', 'below', 'to', 'avoid', 'the', 'truck', '.'],
                #  ['Emerging', 'from', 'hospital', 'in', 'bandages', 'he', 'said', ':', '"', 'There', 'was', 'debris', 'in', 'the', 'streets', ',', 'people', 'lying', 'injured', 'or', 'dead', 'in', 'the', 'road', '.', '"'],
                #  ['Video', ':', 'Hollande', 'On', "'", 'Horror', "'", 'Of', 'Attack'],
                #  ['Two', 'Americans', '-', 'Sean', 'Copeland', 'and', 'his', '11-year', '-', 'old', 'son', 'Brodie', 'from', 'Texas', '-', 'have', 'been', 'confirmed', 'among', 'the', 'dead', ',', 'a', 'US', 'official', 'said', '.']]
                # and the flattened context_words:
                # [':', ':', 'History', 'Of', 'Deadly', 'Attacks', 'In', 'France', 'One', ',', 'Laurence', 'Olding', ',', 'was', 'with', 'his', 'fiance', 'and', 'jumped', 'over', 'a', 'promenade', 'wall', 'onto', 'the', 'concrete', 'below', 'to', 'avoid', 'the', 'truck', '.', 'Emerging', 'from', 'hospital', 'in', 'bandages', 'he', 'said', ':', '"', 'There', 'was', 'debris', 'in', 'the', 'streets', ',', 'people', 'lying', 'injured', 'or', 'dead', 'in', 'the', 'road', '.', '"', 'Video', ':', 'Hollande', 'On', "'", 'Horror', "'", 'Of', 'Attack', 'Two', 'Americans', '-', 'Sean', 'Copeland', 'and', 'his', '11-year', '-', 'old', 'son', 'Brodie', 'from', 'Texas', '-', 'have', 'been', 'confirmed', 'among', 'the', 'dead', ',', 'a', 'US', 'official', 'said', '.']
if __name__ == '__main__':
    prepare_data()
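
# A minimal sketch of persisting the generated pairs, following the
# commented-out writer in prepare_data(); the output directory and the JSON
# field names here are assumptions, not part of the original pipeline.
def write_split(split, triples):
    # triples: iterable of (input_template, output_template, context) tuples
    with open('head_templates_preprocessed_data/{}.jsonl'.format(split), 'w') as writer:
        for inp, out, ctx in triples:
            writer.write(json.dumps({'input': inp, 'output': out, 'context': ctx}) + '\n')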