import re
import json
from copy import deepcopy

from transformers import BartTokenizer
# Scratch notes: a sample RAMS record and template-filling experiments.
# dict = {"rel_triggers": [], "gold_rel_links": [], "doc_key": "nw_RC00c8620ef5810429342a1c339e6c76c1b0b9add3f6010f04482fd832", "ent_spans": [[27, 27, [["evt043arg01communicator", 1.0]]], [48, 48, [["evt043arg03place", 1.0]]], [32, 36, [["evt043arg02recipient", 1.0]]]], "language_id": "eng", "source_url": "http://bbc.co.uk/sport/athletics/36295481", "evt_triggers": [[31, 31, [["contact.prevarication.broadcast", 1.0]]]], "split": "test", "sentences": [["We", "are", "ashamed", "of", "them", ".", "\""], ["However", ",", "Mutko", "stopped", "short", "of", "admitting", "the", "doping", "scandal", "was", "state", "sponsored", "."], ["\"", "We", "are", "very", "sorry", "that", "athletes", "who", "tried", "to", "deceive", "us", ",", "and", "the", "world", ",", "were", "not", "caught", "sooner", "."], ["We", "are", "very", "sorry", "because", "Russia", "is", "committed", "to", "upholding", "the", "highest", "standards", "in", "sport", "and", "is", "opposed", "to", "anything", "that", "threatens", "the", "Olympic", "values", ",", "\"", "he", "said", "."], ["English", "former", "heptathlete", "and", "Athens", "2004", "bronze", "medallist", "Kelly", "Sotherton", "was", "unhappy", "with", "Mutko", "'s", "plea", "for", "Russia", "'s", "ban", "to", "be", "lifted", "for", "Rio"]], "gold_evt_links": [[[31, 31], [27, 27], "evt043arg01communicator"], [[31, 31], [32, 32], "evt043arg02recipient"], [[31, 31], [48, 48], "evt043arg03place"]], "clusters": [[[0, 0], [22, 22], [32, 32], [43, 43], [48, 48], [90, 91]], [[9, 9], [70, 70], [86, 87]]], "corefs": [[[0, 0], [22, 22], [32, 32], [43, 43], [48, 48], [90, 90]], [[9, 9], [70, 70], [86, 86]]]}
#
# template = "what is the <arg>"
#
# context_words = [w for sent in dict['sentences'] for w in sent]
#
# argtext = context_words[27]
#
# print(argtext)
#
# template = re.sub('<arg>', argtext, template)
#
# print(template)
# for lidx, triple in enumerate(dict['gold_evt_links']):
#     # trigger span, argument span, argument role
#     # example: "gold_evt_links":
#     # [[[40, 40], [33, 33], "evt089arg01victim"],
#     #  [[40, 40], [28, 28], "evt089arg02place"]]
#     trigger_span, argument_span, arg_name = triple
#     # which argument slot this role maps to ('arg1', 'arg2', ...)
#     arg_num = ontology_dict[evt_type.replace('n/a', 'unspecified')][arg_name]
#     # the argument phrase itself
#     arg_text = ' '.join(context_words[argument_span[0]:argument_span[1] + 1])
#     # replace each numbered placeholder in the template with the argument
#     # text, filling the <arg> slots in list order
#     template[lidx] = re.sub('<{}>'.format(arg_num), arg_text, template[lidx])
# "ent_spans": [[27, 27, [["evt043arg01communicator", 1.0]]],
#               [48, 48, [["evt043arg03place", 1.0]]],
#               [32, 36, [["evt043arg02recipient", 1.0]]]]
# context_words = [w for sent in dict['sentences'] for w in sent]
#
# print(context_words[32])
# print(context_words[33])
# print(context_words[34])
# print(context_words[35])
# print(context_words[36])
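
# The scratch notes above reduce to two regex passes over a role template:
# fill a specific numbered slot with its argument phrase, then mask the
# remaining numbered slots to a bare <arg>. A self-contained sketch (the
# template string here is made up for illustration):
#
#   template = '<arg1> was injured by <arg2> at <arg3>'
#   filled = re.sub('<arg1>', 'people', template)   # fill one linked slot
#   masked = re.sub(r'<arg\d>', '<arg>', filled)    # mask the unlinked slots
#   # masked == 'people was injured by <arg> at <arg>'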
def get_event_type(ex):
    # Collect every event type label attached to the example's event triggers.
    evt_type = []
    for evt in ex['evt_triggers']:
        for t in evt[2]:
            evt_type.append(t[0])
    return evt_type
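
# Usage sketch, on a toy record shaped like the sample commented above:
#   get_event_type({'evt_triggers': [[31, 31, [['contact.prevarication.broadcast', 1.0]]]]})
#   # -> ['contact.prevarication.broadcast']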
def create_gold_gen(ex, ontology_dict, mark_trigger=True):
    # Three parallel lists: the input templates, the gold output templates,
    # and the document context for each role template.
    INPUT = []
    OUTPUT = []
    CONTEXT = []
    evt_type = get_event_type(ex)[0]
    context_words = [w for sent in ex['sentences'] for w in sent]
    input_template = ontology_dict[evt_type.replace('n/a', 'unspecified')]['template']
    i = len(input_template)
    input_list = []
    for x in range(i):
        # Mask every numbered placeholder (<arg1>, <arg2>, ...) to a bare <arg>.
        masked = re.sub(r'<arg\d>', '<arg>', input_template[x])
        input_list.append(masked)
    # input_list now holds the templates with every <argN> masked to <arg>;
    # next, split each one on spaces.
    for x in range(i):
        INPUT.append(input_list[x].split(' '))
    # Subword-tokenize each space-tokenized template with the BART tokenizer.
    # (Loading the tokenizer on every call is slow; hoisting it to module
    # level would be cheaper.)
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
    temp = []
    for x in range(len(INPUT)):
        tokenized_input_template = []
        for w in INPUT[x]:
            tokenized_input_template.extend(tokenizer.tokenize(w, add_prefix_space=True))
        temp.append(tokenized_input_template)
    # temp (the subword-tokenized templates) is not used further here; the
    # symmetric output tokenization is likewise left disabled below.
    # Work on a copy of the role templates: the loop below fills them in
    # place, and mutating the shared ontology_dict entry would corrupt it
    # for later examples.
    template = deepcopy(ontology_dict[evt_type.replace('n/a', 'unspecified')]['template'])
    # e.g. ['<arg1> was injured by injurer with medical issue at place',
    #       'Victim was injured by <arg2> with medical issue at place',
    #       'Victim was injured by injurer with medical issue at <arg3> place',
    #       'Victim was injured by injurer with <arg4> medical issue at place']
    for lidx, triple in enumerate(ex['gold_evt_links']):
        # Each link is (trigger span, argument span, argument role), e.g.
        # "gold_evt_links": [[[50, 50], [48, 48], 'evt092arg01victim'],
        #                    [[50, 50], [7, 7], 'evt092arg03place']]
        trigger_span, argument_span, arg_name = triple
        # Which argument slot this role maps to ('arg1', 'arg2', ...).
        arg_num = ontology_dict[evt_type.replace('n/a', 'unspecified')][arg_name]
        # The argument phrase itself.
        arg_text = ' '.join(context_words[argument_span[0]:argument_span[1] + 1])
        # Find the role template that still contains this slot's placeholder,
        # then substitute the argument text for it.
        for index in range(len(template)):
            if '<{}>'.format(arg_num) in template[index]:
                break
        template[index] = re.sub('<{}>'.format(arg_num), arg_text, template[index])
    # Trigger marking (mark_trigger) is not applied in this version.
    trigger = ex['evt_triggers'][0]
    # One copy of the flattened document context per role template.
    for w in range(i):
        CONTEXT.append(context_words)
    output_template = []
    # template now has every linked slot filled with its argument phrase;
    # mask any placeholders that received no argument back to a bare <arg>.
    for j in range(len(template)):
        output_template.append(re.sub(r'<arg\d>', '<arg>', template[j]))
    for j in range(len(output_template)):
        OUTPUT.append(output_template[j].split(' '))
    # Subword tokenization of the outputs is currently disabled:
    # tokenized_out_template = []
    # for j in range(len(OUTPUT)):
    #     for w in OUTPUT[j]:
    #         tokenized_out_template.extend(tokenizer.tokenize(w, add_prefix_space=True))
    return INPUT, OUTPUT, CONTEXT
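
# Usage sketch for create_gold_gen (assumes the ontology CSV and the RAMS
# data files used by prepare_data below are available):
#   ontology = load_ontology()
#   with open('./data/RAMS_1.0/data/dev.jsonlines') as f:
#       ex = json.loads(f.readline())
#   inp, out, ctx = create_gold_gen(ex, ontology)
#   # inp, out and ctx are parallel lists, one entry per role template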
def load_ontology():
    ontology_dict = {}
    with open('aida_ontology_fj-5.csv', 'r') as f:
        for lidx, line in enumerate(f):
            if lidx == 0:  # header
                continue
            fields = line.strip().split(',')
            if len(fields) < 2:
                break
            evt_type = fields[0]
            if evt_type in ontology_dict.keys():
                # All argument roles for this event type.
                args = fields[2:]
                # Append this row's role template to the event type's template list.
                ontology_dict[evt_type]['template'].append(fields[1])
                for i, arg in enumerate(args):
                    if arg != '':
                        # Record the two-way mapping between slot name and role name.
                        ontology_dict[evt_type]['arg{}'.format(i + 1)] = arg
                        ontology_dict[evt_type][arg] = 'arg{}'.format(i + 1)
            # First time this event type is seen: create its entry.
            else:
                ontology_dict[evt_type] = {}
                args = fields[2:]
                ontology_dict[evt_type]['template'] = []
                ontology_dict[evt_type]['template'].append(fields[1])
                for i, arg in enumerate(args):
                    if arg != '':
                        ontology_dict[evt_type]['arg{}'.format(i + 1)] = arg
                        ontology_dict[evt_type][arg] = 'arg{}'.format(i + 1)
    return ontology_dict
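
# Expected 'aida_ontology_fj-5.csv' layout, inferred from the parsing above:
# column 0 = event type, column 1 = one role template, columns 2+ = role names
# aligned with <arg1>, <arg2>, ... A hypothetical row:
#   life.injure.unspecified,<arg1> was injured by injurer with medical issue at place,evt092arg01victim
# would yield ontology_dict['life.injure.unspecified'] holding a 'template'
# list plus two-way slot/role mappings such as
#   {'arg1': 'evt092arg01victim', 'evt092arg01victim': 'arg1', ...}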
def prepare_data():
    ontology_dict = load_ontology()
    for split, f in [('val', './data/RAMS_1.0/data/dev.jsonlines'),
                     ('train', './data/RAMS_1.0/data/train.jsonlines'),
                     ('test', './data/RAMS_1.0/data/test_head_coref.jsonlines')]:
        # , open('head_templates_preprocessed_data/{}.jsonl'.format(split), 'w') as writer
        with open(f, 'r') as reader:
            for lidx, line in enumerate(reader):
                ex = json.loads(line.strip())
                event_type = get_event_type(ex)[0]
                # debug: inspect one specific document; a sample record and
                # its ontology entry are reproduced below for reference.
                # if ex['doc_key'] == 'nw_RC013c8e78b7e8a4fb22193483877058f712dfd8b75b7a06d950de0b8f':
                #     print(ex)
                #     print(ontology_dict[event_type.replace('n/a', 'unspecified')])
                # {'rel_triggers': [], 'gold_rel_links': [],
                #  'doc_key': 'nw_RC04992035300b2ec94d8692646a28dc8b5f210c94842d20834c5342df',
                #  'ent_spans': [[48, 48, [['evt092arg01victim', 1.0]]], [7, 7, [['evt092arg03place', 1.0]]]],
                #  'language_id': 'eng',
                #  'source_url': 'http://news.sky.com/story/attack-in-nice-truck-ploughes-into-crowd-10502068',
                #  'evt_triggers': [[50, 50, [['life.injure.n/a', 1.0]]]], 'split': 'test',
                #  'sentences': [[':', ':', 'History', 'Of', 'Deadly', 'Attacks', 'In', 'France'],
                #                ['One', ',', 'Laurence', 'Olding', ',', 'was', 'with', 'his', 'fiance', 'and',
                #                 'jumped', 'over', 'a', 'promenade', 'wall', 'onto', 'the', 'concrete', 'below',
                #                 'to', 'avoid', 'the', 'truck', '.'],
                #                ['Emerging', 'from', 'hospital', 'in', 'bandages', 'he', 'said', ':', '"', 'There',
                #                 'was', 'debris', 'in', 'the', 'streets', ',', 'people', 'lying', 'injured', 'or',
                #                 'dead', 'in', 'the', 'road', '.', '"'],
                #                ['Video', ':', 'Hollande', 'On', "'", 'Horror', "'", 'Of', 'Attack'],
                #                ['Two', 'Americans', '-', 'Sean', 'Copeland', 'and', 'his', '11-year', '-', 'old',
                #                 'son', 'Brodie', 'from', 'Texas', '-', 'have', 'been', 'confirmed', 'among', 'the',
                #                 'dead', ',', 'a', 'US', 'official', 'said', '.']],
                #  'gold_evt_links': [[[50, 50], [48, 48], 'evt092arg01victim'],
                #                     [[50, 50], [7, 7], 'evt092arg03place']],
                #  'clusters': [[[10, 11], [15, 15], [37, 37]], [[70, 71], [73, 73]]],
                #  'corefs': [[[11, 11], [15, 15], [37, 37]], [[71, 71], [73, 73]]]}
                # Its ontology entry:
                # {'template': ['<arg1> was injured by injurer with medical issue at place',
                #               'Victim was injured by <arg2> with medical issue at place',
                #               'Victim was injured by injurer with medical issue at <arg3> place',
                #               'Victim was injured by injurer with <arg4> medical issue at place'],
                #  'arg1': 'evt092arg01victim', 'evt092arg01victim': 'arg1', 'arg2': 'evt092arg02injurer',
                #  'evt092arg02injurer': 'arg2', 'arg3': 'evt092arg03place', 'evt092arg03place': 'arg3',
                #  'arg4': 'evt092arg04medicalissue', 'evt092arg04medicalissue': 'arg4'}
                # create_gold_gen works on a deep copy of the role templates,
                # so ontology_dict stays intact across examples.
                input_template, output_template, context = create_gold_gen(ex, ontology_dict, True)
                # debug: expected templates for the document above.
                # if ex['doc_key'] == 'nw_RC013c8e78b7e8a4fb22193483877058f712dfd8b75b7a06d950de0b8f':
                #     print(input_template)
                #     print(output_template)
                # input_template:
                # [['<arg>', 'was', 'injured', 'by', 'injurer', 'with', 'medical', 'issue', 'at', 'place'],
                #  ['Victim', 'was', 'injured', 'by', '<arg>', 'with', 'medical', 'issue', 'at', 'place'],
                #  ['Victim', 'was', 'injured', 'by', 'injurer', 'with', 'medical', 'issue', 'at', '<arg>', 'place'],
                #  ['Victim', 'was', 'injured', 'by', 'injurer', 'with', '<arg>', 'medical', 'issue', 'at', 'place']]
                # output_template:
                # [['people', 'was', 'injured', 'by', 'injurer', 'with', 'medical', 'issue', 'at', 'place'],
                #  ['Victim', 'was', 'injured', 'by', '<arg>', 'with', 'medical', 'issue', 'at', 'place'],
                #  ['Victim', 'was', 'injured', 'by', 'injurer', 'with', 'medical', 'issue', 'at', '<arg>', 'place'],
                #  ['Victim', 'was', 'injured', 'by', 'injurer', 'with', '<arg>', 'medical', 'issue', 'at', 'place']]
                # input_template, output_template and context each have one
                # entry per role template (4 for this event type).
                # For reference, ex['sentences'] for the document above:
                # [[':', ':', 'History', 'Of', 'Deadly', 'Attacks', 'In', 'France'],
                #  ['One', ',', 'Laurence', 'Olding', ',', 'was', 'with', 'his', 'fiance', 'and', 'jumped', 'over', 'a', 'promenade', 'wall', 'onto', 'the', 'concrete', 'below', 'to', 'avoid', 'the', 'truck', '.'],
                #  ['Emerging', 'from', 'hospital', 'in', 'bandages', 'he', 'said', ':', '"', 'There', 'was', 'debris', 'in', 'the', 'streets', ',', 'people', 'lying', 'injured', 'or', 'dead', 'in', 'the', 'road', '.', '"'],
                #  ['Video', ':', 'Hollande', 'On', "'", 'Horror', "'", 'Of', 'Attack'],
                #  ['Two', 'Americans', '-', 'Sean', 'Copeland', 'and', 'his', '11-year', '-', 'old', 'son', 'Brodie', 'from', 'Texas', '-', 'have', 'been', 'confirmed', 'among', 'the', 'dead', ',', 'a', 'US', 'official', 'said', '.']]
                # and the flattened context_words:
                # [':', ':', 'History', 'Of', 'Deadly', 'Attacks', 'In', 'France', 'One', ',', 'Laurence', 'Olding', ',', 'was', 'with', 'his', 'fiance', 'and', 'jumped', 'over', 'a', 'promenade', 'wall', 'onto', 'the', 'concrete', 'below', 'to', 'avoid', 'the', 'truck', '.', 'Emerging', 'from', 'hospital', 'in', 'bandages', 'he', 'said', ':', '"', 'There', 'was', 'debris', 'in', 'the', 'streets', ',', 'people', 'lying', 'injured', 'or', 'dead', 'in', 'the', 'road', '.', '"', 'Video', ':', 'Hollande', 'On', "'", 'Horror', "'", 'Of', 'Attack', 'Two', 'Americans', '-', 'Sean', 'Copeland', 'and', 'his', '11-year', '-', 'old', 'son', 'Brodie', 'from', 'Texas', '-', 'have', 'been', 'confirmed', 'among', 'the', 'dead', ',', 'a', 'US', 'official', 'said', '.']
if __name__ == '__main__':
    prepare_data()
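
# A minimal sketch of persisting the generated pairs, following the
# commented-out writer in prepare_data(); the output directory and the JSON
# field names here are assumptions, not part of the original pipeline.
def write_split(split, triples):
    # triples: iterable of (input_template, output_template, context) tuples
    with open('head_templates_preprocessed_data/{}.jsonl'.format(split), 'w') as writer:
        for inp, out, ctx in triples:
            writer.write(json.dumps({'input': inp, 'output': out, 'context': ctx}) + '\n')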