# Bart-gen-arg / TEST / ytest.py
import re
import json

from transformers import BartTokenizer
# dict = {"rel_triggers": [], "gold_rel_links": [], "doc_key": "nw_RC00c8620ef5810429342a1c339e6c76c1b0b9add3f6010f04482fd832", "ent_spans": [[27, 27, [["evt043arg01communicator", 1.0]]], [48, 48, [["evt043arg03place", 1.0]]], [32, 36, [["evt043arg02recipient", 1.0]]]], "language_id": "eng", "source_url": "http://bbc.co.uk/sport/athletics/36295481", "evt_triggers": [[31, 31, [["contact.prevarication.broadcast", 1.0]]]], "split": "test", "sentences": [["We", "are", "ashamed", "of", "them", ".", "\""], ["However", ",", "Mutko", "stopped", "short", "of", "admitting", "the", "doping", "scandal", "was", "state", "sponsored", "."], ["\"", "We", "are", "very", "sorry", "that", "athletes", "who", "tried", "to", "deceive", "us", ",", "and", "the", "world", ",", "were", "not", "caught", "sooner", "."], ["We", "are", "very", "sorry", "because", "Russia", "is", "committed", "to", "upholding", "the", "highest", "standards", "in", "sport", "and", "is", "opposed", "to", "anything", "that", "threatens", "the", "Olympic", "values", ",", "\"", "he", "said", "."], ["English", "former", "heptathlete", "and", "Athens", "2004", "bronze", "medallist", "Kelly", "Sotherton", "was", "unhappy", "with", "Mutko", "'s", "plea", "for", "Russia", "'s", "ban", "to", "be", "lifted", "for", "Rio"]], "gold_evt_links": [[[31, 31], [27, 27], "evt043arg01communicator"], [[31, 31], [32, 32], "evt043arg02recipient"], [[31, 31], [48, 48], "evt043arg03place"]], "clusters": [[[0, 0], [22, 22], [32, 32], [43, 43], [48, 48], [90, 91]], [[9, 9], [70, 70], [86, 87]]], "corefs": [[[0, 0], [22, 22], [32, 32], [43, 43], [48, 48], [90, 90]], [[9, 9], [70, 70], [86, 86]]]}
#
# template = "what is the <arg>"
#
# context_words = [w for sent in dict['sentences'] for w in sent]
#
# argtext = context_words[27]
#
# print(argtext)
#
# template = re.sub('<arg>', argtext, template)
#
# print(template)
# for lidx, triple in enumerate(dict['gold_evt_links']):
#     # trigger span, argument span, argument role name
#     # example: "gold_evt_links":
#     #   [[[40, 40], [33, 33], "evt089arg01victim"],
#     #    [[40, 40], [28, 28], "evt089arg02place"]]
#     trigger_span, argument_span, arg_name = triple
#     # which argument slot the role maps to (e.g. 'arg1')
#     arg_num = ontology_dict[evt_type.replace('n/a', 'unspecified')][arg_name]
#     # the concrete argument phrase
#     arg_text = ' '.join(context_words[argument_span[0]:argument_span[1] + 1])
#     # replace each <argN> placeholder in the template with the concrete
#     # argument phrase, in order
#     template[lidx] = re.sub('<{}>'.format(arg_num), arg_text, template[lidx])
# # "ent_spans": [[27, 27, [["evt043arg01communicator", 1.0]]],
# # [48, 48, [["evt043arg03place", 1.0]]],
# # [32, 36, [["evt043arg02recipient", 1.0]]]]
# context_words = [w for sent in dict['sentences'] for w in sent]
#
# print(context_words[32])
# print(context_words[33])
# print(context_words[34])
# print(context_words[35])
# print(context_words[36])
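# A minimal runnable sketch of the template-filling idea from the scratch
# notes above (the _demo_* names are illustrative, not part of the pipeline):
# pick the argument phrase out of the flattened context by its token span and
# substitute it into the blanked template with re.sub.
_demo_context = ['However', ',', 'Mutko', 'stopped', 'short']
_demo_span = (2, 2)  # token span of the argument phrase
_demo_text = ' '.join(_demo_context[_demo_span[0]:_demo_span[1] + 1])
print(re.sub('<arg>', _demo_text, 'what is the <arg>'))  # -> what is the Mutko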
def get_event_type(ex):
    """Collect every event type annotated on the example's triggers."""
    evt_type = []
    for evt in ex['evt_triggers']:
        for t in evt[2]:
            evt_type.append(t[0])
    return evt_type
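# For the sample record pasted at the top of this file, get_event_type would
# return ['contact.prevarication.broadcast'].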
def create_gold_gen(ex, ontology_dict, mark_trigger=True):
    # Three parallel lists: input templates, gold-filled output templates,
    # and the document context paired with each template.
    INPUT = []
    OUTPUT = []
    CONTEXT = []
    evt_type = get_event_type(ex)[0]
    context_words = [w for sent in ex['sentences'] for w in sent]
    input_template = ontology_dict[evt_type.replace('n/a', 'unspecified')]['template']
    n_templates = len(input_template)
    # Blank every numbered placeholder (<arg1>, <arg2>, ...) down to the
    # generic <arg> token; input_list holds the blanked templates.
    input_list = []
    for x in range(n_templates):
        blanked = re.sub(r'<arg\d>', '<arg>', input_template[x])
        input_list.append(blanked)
    # Whitespace-tokenize each blanked template; the next step is subword
    # tokenization.
    for x in range(n_templates):
        INPUT.append(input_list[x].split(' '))
    # Debug: run the BART subword tokenizer over the first template only
    # (note the break below). Loading the tokenizer on every call is slow;
    # a real implementation would construct it once outside this function.
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
    temp = []
    for x in range(len(INPUT)):
        tokenized_input_template = []
        for w in INPUT[x]:
            tokenized_input_template.extend(tokenizer.tokenize(w, add_prefix_space=True))
        temp.append(tokenized_input_template)
        print(temp)
        break
    # Copy the template list so the substitutions below do not mutate the
    # shared ontology_dict entry (strings are immutable, so a shallow copy
    # suffices). E.g. for life.injure:
    # ['<arg1> was injured by injurer with medical issue at place',
    #  'Victim was injured by <arg2> with medical issue at place',
    #  'Victim was injured by injurer with medical issue at <arg3> place',
    #  'Victim was injured by injurer with <arg4> medical issue at place']
    template = list(ontology_dict[evt_type.replace('n/a', 'unspecified')]['template'])
    for lidx, triple in enumerate(ex['gold_evt_links']):
        # trigger span, argument span, argument role name, e.g.
        # [[50, 50], [48, 48], 'evt092arg01victim']
        # [[50, 50], [7, 7], 'evt092arg03place']
        trigger_span, argument_span, arg_name = triple
        if ex['doc_key'] == 'nw_RC013c8e78b7e8a4fb22193483877058f712dfd8b75b7a06d950de0b8f':
            print(len(ex['gold_evt_links']))
        # which argument slot the role maps to (e.g. 'arg1')
        arg_num = ontology_dict[evt_type.replace('n/a', 'unspecified')][arg_name]
        # the concrete argument phrase
        arg_text = ' '.join(context_words[argument_span[0]:argument_span[1] + 1])
        # Find the template that still carries this slot's placeholder and
        # substitute the argument phrase for it.
        for index in range(len(template)):
            if '<{}>'.format(arg_num) in template[index]:
                break
        template[index] = re.sub('<{}>'.format(arg_num), arg_text, template[index])
        if ex['doc_key'] == 'nw_RC013c8e78b7e8a4fb22193483877058f712dfd8b75b7a06d950de0b8f':
            print('aaa')
            print(template)
    trigger = ex['evt_triggers'][0]  # currently unused; mark_trigger is not applied yet
    # Pair every template with the same flattened document context.
    for _ in range(n_templates):
        CONTEXT.append(context_words)
    # All gold arguments are filled in; blank any remaining numbered
    # placeholders down to <arg> and whitespace-tokenize for OUTPUT.
    output_template = []
    for t in template:
        output_template.append(re.sub(r'<arg\d>', '<arg>', t))
    for t in output_template:
        OUTPUT.append(t.split(' '))
    # Sketch of a subword-tokenization pass over the outputs (unused):
    # tokenized_out_template = []
    # for i in range(len(spaceout_tokenized_template)):
    #     for w in spaceout_tokenized_template[i]:
    #         tokenized_out_template.extend(self.tokenizer.tokenize(w, add_prefix_space=True))
    #     OUTPUT.append(tokenized_out_template)
    #     tokenized_out_template = []
    return INPUT, OUTPUT, CONTEXT
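# For the life.injure example printed in prepare_data() below, INPUT[0] is
# ['<arg>', 'was', 'injured', 'by', 'injurer', 'with', 'medical', 'issue',
# 'at', 'place'] and OUTPUT[0] swaps the first slot for the gold victim
# phrase 'people'; the full expected values appear as comments there.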
def load_ontology():
    ontology_dict = {}
    # note: the naive comma split assumes neither templates nor role names
    # contain commas
    with open('aida_ontology_fj-5.csv', 'r') as f:
        for lidx, line in enumerate(f):
            if lidx == 0:  # header
                continue
            fields = line.strip().split(',')
            if len(fields) < 2:
                break
            evt_type = fields[0]
            if evt_type not in ontology_dict:
                # first occurrence of this event type: create its entry
                ontology_dict[evt_type] = {'template': []}
            # append this row's template to the event type's template list
            ontology_dict[evt_type]['template'].append(fields[1])
            # the remaining fields are the argument roles for this event type
            args = fields[2:]
            for i, arg in enumerate(args):
                if arg != '':
                    # map both ways: 'arg1' -> role name and role name -> 'arg1'
                    ontology_dict[evt_type]['arg{}'.format(i + 1)] = arg
                    ontology_dict[evt_type][arg] = 'arg{}'.format(i + 1)
    return ontology_dict
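# Hypothetical sketch of the expected CSV layout, inferred from the
# life.injure entry printed in prepare_data() below:
#   event_type,template,arg1,arg2,arg3,arg4
#   life.injure.unspecified,<arg1> was injured by injurer ...,evt092arg01victim,evt092arg02injurer,evt092arg03place,evt092arg04medicalissue
# Each event type ends up with a 'template' list (one row per placeholder
# variant) plus two-way 'arg1' <-> 'evt092arg01victim' style mappings.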
def prepare_data():
    ontology_dict = load_ontology()
    for split, f in [('val', './data/RAMS_1.0/data/dev.jsonlines'),
                     ('train', './data/RAMS_1.0/data/train.jsonlines'),
                     ('test', './data/RAMS_1.0/data/test_head_coref.jsonlines')]:
        # , open('head_templates_preprocessed_data/{}.jsonl'.format(split), 'w') as writer
        with open(f, 'r') as reader:
            # print(ontology_dict['contact.prevarication.broadcast'])
            for lidx, line in enumerate(reader):
                ex = json.loads(line.strip())
                event_type = get_event_type(ex)[0]
                if ex['doc_key'] == 'nw_RC013c8e78b7e8a4fb22193483877058f712dfd8b75b7a06d950de0b8f':
                    # Sample record:
                    # {'rel_triggers': [], 'gold_rel_links': [],
                    #  'doc_key': 'nw_RC04992035300b2ec94d8692646a28dc8b5f210c94842d20834c5342df',
                    #  'ent_spans': [[48, 48, [['evt092arg01victim', 1.0]]], [7, 7, [['evt092arg03place', 1.0]]]],
                    #  'language_id': 'eng',
                    #  'source_url': 'http://news.sky.com/story/attack-in-nice-truck-ploughes-into-crowd-10502068',
                    #  'evt_triggers': [[50, 50, [['life.injure.n/a', 1.0]]]], 'split': 'test',
                    #  'sentences': [[':', ':', 'History', 'Of', 'Deadly', 'Attacks', 'In', 'France'],
                    #                ['One', ',', 'Laurence', 'Olding', ',', 'was', 'with', 'his', 'fiance', 'and',
                    #                 'jumped', 'over', 'a', 'promenade', 'wall', 'onto', 'the', 'concrete', 'below',
                    #                 'to', 'avoid', 'the', 'truck', '.'],
                    #                ['Emerging', 'from', 'hospital', 'in', 'bandages', 'he', 'said', ':', '"', 'There',
                    #                 'was', 'debris', 'in', 'the', 'streets', ',', 'people', 'lying', 'injured', 'or',
                    #                 'dead', 'in', 'the', 'road', '.', '"'],
                    #                ['Video', ':', 'Hollande', 'On', "'", 'Horror', "'", 'Of', 'Attack'],
                    #                ['Two', 'Americans', '-', 'Sean', 'Copeland', 'and', 'his', '11-year', '-', 'old',
                    #                 'son', 'Brodie', 'from', 'Texas', '-', 'have', 'been', 'confirmed', 'among', 'the',
                    #                 'dead', ',', 'a', 'US', 'official', 'said', '.']],
                    #  'gold_evt_links': [[[50, 50], [48, 48], 'evt092arg01victim'],
                    #                     [[50, 50], [7, 7], 'evt092arg03place']],
                    #  'clusters': [[[10, 11], [15, 15], [37, 37]], [[70, 71], [73, 73]]],
                    #  'corefs': [[[11, 11], [15, 15], [37, 37]], [[71, 71], [73, 73]]]}
                    print(ex)
                    # Ontology entry for this event type:
                    # {'template': ['<arg1> was injured by injurer with medical issue at place',
                    #               'Victim was injured by <arg2> with medical issue at place',
                    #               'Victim was injured by injurer with medical issue at <arg3> place',
                    #               'Victim was injured by injurer with <arg4> medical issue at place'],
                    #  'arg1': 'evt092arg01victim', 'evt092arg01victim': 'arg1', 'arg2': 'evt092arg02injurer',
                    #  'evt092arg02injurer': 'arg2', 'arg3': 'evt092arg03place', 'evt092arg03place': 'arg3',
                    #  'arg4': 'evt092arg04medicalissue', 'evt092arg04medicalissue': 'arg4'}
                    print(ontology_dict[event_type.replace('n/a', 'unspecified')])
                input_template, output_template, context = create_gold_gen(ex, ontology_dict, True)
                # create_gold_gen previously mutated the shared ontology
                # templates, so the ontology was reloaded after every example;
                # the copy inside create_gold_gen makes that unnecessary.
                if ex['doc_key'] == 'nw_RC013c8e78b7e8a4fb22193483877058f712dfd8b75b7a06d950de0b8f':
                    # Expected input templates:
                    # [['<arg>', 'was', 'injured', 'by', 'injurer', 'with', 'medical', 'issue', 'at', 'place'],
                    #  ['Victim', 'was', 'injured', 'by', '<arg>', 'with', 'medical', 'issue', 'at', 'place'],
                    #  ['Victim', 'was', 'injured', 'by', 'injurer', 'with', 'medical', 'issue', 'at', '<arg>', 'place'],
                    #  ['Victim', 'was', 'injured', 'by', 'injurer', 'with', '<arg>', 'medical', 'issue', 'at', 'place']]
                    print(input_template)
                    # Expected output templates (gold victim phrase filled in):
                    # [['people', 'was', 'injured', 'by', 'injurer', 'with', 'medical', 'issue', 'at', 'place'],
                    #  ['Victim', 'was', 'injured', 'by', '<arg>', 'with', 'medical', 'issue', 'at', 'place'],
                    #  ['Victim', 'was', 'injured', 'by', 'injurer', 'with', 'medical', 'issue', 'at', '<arg>', 'place'],
                    #  ['Victim', 'was', 'injured', 'by', 'injurer', 'with', '<arg>', 'medical', 'issue', 'at', 'place']]
                    print(output_template)
                # input_template, output_template, and context all have one
                # entry per template (4 for life.injure).
                # context as per-sentence token lists:
                # [[':', ':', 'History', 'Of', 'Deadly', 'Attacks', 'In', 'France'],
                #  ['One', ',', 'Laurence', 'Olding', ',', 'was', 'with', 'his', 'fiance', 'and', 'jumped', 'over', 'a', 'promenade', 'wall', 'onto', 'the', 'concrete', 'below', 'to', 'avoid', 'the', 'truck', '.'],
                #  ['Emerging', 'from', 'hospital', 'in', 'bandages', 'he', 'said', ':', '"', 'There', 'was', 'debris', 'in', 'the', 'streets', ',', 'people', 'lying', 'injured', 'or', 'dead', 'in', 'the', 'road', '.', '"'],
                #  ['Video', ':', 'Hollande', 'On', "'", 'Horror', "'", 'Of', 'Attack'],
                #  ['Two', 'Americans', '-', 'Sean', 'Copeland', 'and', 'his', '11-year', '-', 'old', 'son', 'Brodie', 'from', 'Texas', '-', 'have', 'been', 'confirmed', 'among', 'the', 'dead', ',', 'a', 'US', 'official', 'said', '.']]
                # and flattened into a single token list:
                # [':', ':', 'History', 'Of', 'Deadly', 'Attacks', 'In', 'France', 'One', ',', 'Laurence', 'Olding', ',', 'was', 'with', 'his', 'fiance', 'and', 'jumped', 'over', 'a', 'promenade', 'wall', 'onto', 'the', 'concrete', 'below', 'to', 'avoid', 'the', 'truck', '.', 'Emerging', 'from', 'hospital', 'in', 'bandages', 'he', 'said', ':', '"', 'There', 'was', 'debris', 'in', 'the', 'streets', ',', 'people', 'lying', 'injured', 'or', 'dead', 'in', 'the', 'road', '.', '"', 'Video', ':', 'Hollande', 'On', "'", 'Horror', "'", 'Of', 'Attack', 'Two', 'Americans', '-', 'Sean', 'Copeland', 'and', 'his', '11-year', '-', 'old', 'son', 'Brodie', 'from', 'Texas', '-', 'have', 'been', 'confirmed', 'among', 'the', 'dead', ',', 'a', 'US', 'official', 'said', '.']
if __name__ == '__main__':
    prepare_data()