import re
import json
from copy import deepcopy

from transformers import BartTokenizer
import jsonlines

# Exploratory snippet kept for reference: a sample RAMS example and the
# substitution logic that was later folded into create_gold_gen().
# sample = {"rel_triggers": [], "gold_rel_links": [],
#           "doc_key": "nw_RC00c8620ef5810429342a1c339e6c76c1b0b9add3f6010f04482fd832",
#           "ent_spans": [[27, 27, [["evt043arg01communicator", 1.0]]],
#                         [48, 48, [["evt043arg03place", 1.0]]],
#                         [32, 36, [["evt043arg02recipient", 1.0]]]],
#           "language_id": "eng",
#           "source_url": "http://bbc.co.uk/sport/athletics/36295481",
#           "evt_triggers": [[31, 31, [["contact.prevarication.broadcast", 1.0]]]],
#           "split": "test",
#           "sentences": [["We", "are", "ashamed", "of", "them", ".", "\""],
#                         ["However", ",", "Mutko", "stopped", "short", "of", "admitting", "the", "doping", "scandal", "was", "state", "sponsored", "."],
#                         ["\"", "We", "are", "very", "sorry", "that", "athletes", "who", "tried", "to", "deceive", "us", ",", "and", "the", "world", ",", "were", "not", "caught", "sooner", "."],
#                         ["We", "are", "very", "sorry", "because", "Russia", "is", "committed", "to", "upholding", "the", "highest", "standards", "in", "sport", "and", "is", "opposed", "to", "anything", "that", "threatens", "the", "Olympic", "values", ",", "\"", "he", "said", "."],
#                         ["English", "former", "heptathlete", "and", "Athens", "2004", "bronze", "medallist", "Kelly", "Sotherton", "was", "unhappy", "with", "Mutko", "'s", "plea", "for", "Russia", "'s", "ban", "to", "be", "lifted", "for", "Rio"]],
#           "gold_evt_links": [[[31, 31], [27, 27], "evt043arg01communicator"],
#                              [[31, 31], [32, 32], "evt043arg02recipient"],
#                              [[31, 31], [48, 48], "evt043arg03place"]],
#           "clusters": [[[0, 0], [22, 22], [32, 32], [43, 43], [48, 48], [90, 91]], [[9, 9], [70, 70], [86, 87]]],
#           "corefs": [[[0, 0], [22, 22], [32, 32], [43, 43], [48, 48], [90, 90]], [[9, 9], [70, 70], [86, 86]]]}

# template = "what is the <arg1>"
# context_words = [w for sent in sample['sentences'] for w in sent]
# argtext = context_words[27]
# print(argtext)
# template = re.sub('<arg1>', argtext, template)
# print(template)

# for lidx, triple in enumerate(sample['gold_evt_links']):
#     # Each triple is (trigger span, argument span, argument role), e.g.:
#     # "gold_evt_links": [[[40, 40], [33, 33], "evt089arg01victim"],
#     #                    [[40, 40], [28, 28], "evt089arg02place"]]
#     trigger_span, argument_span, arg_name = triple
#     # Which argument slot this role maps to ('arg1', 'arg2', ...).
#     arg_num = ontology_dict[evt_type.replace('n/a', 'unspecified')][arg_name]
#     # The argument phrase itself.
#     arg_text = ' '.join(context_words[argument_span[0]:argument_span[1] + 1])
#     # Substitute each <argN> placeholder in the template with the
#     # corresponding argument phrase, one gold link at a time.
#     template[lidx] = re.sub('<{}>'.format(arg_num), arg_text, template[lidx])

# "ent_spans": [[27, 27, [["evt043arg01communicator", 1.0]]],
#               [48, 48, [["evt043arg03place", 1.0]]],
#               [32, 36, [["evt043arg02recipient", 1.0]]]]
# context_words = [w for sent in sample['sentences'] for w in sent]
# print(context_words[32])
# print(context_words[33])
# print(context_words[34])
# print(context_words[35])
# print(context_words[36])


def get_event_type(ex):
    """Collect the event type strings attached to the example's triggers."""
    evt_type = []
    for evt in ex['evt_triggers']:
        for t in evt[2]:
            evt_type.append(t[0])
    return evt_type
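
# create_gold_gen() below relies on one convention: ontology templates carry
# numbered placeholders such as <arg1>, and the model-facing input blanks them
# all down to the generic <arg> token. A minimal sketch of that normalization
# (the template string here is illustrative, not taken from the ontology file):
def _demo_blank_placeholders():
    template = '<arg1> was injured by injurer with medical issue at place'
    blanked = re.sub(r'<arg\d>', '<arg>', template)
    assert blanked == '<arg> was injured by injurer with medical issue at place'
    return blanked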
def create_gold_gen(ex, ontology_dict, mark_trigger=True):
    # Three parallel lists: input templates, gold-filled output templates,
    # and one copy of the document context per template.
    INPUT = []
    OUTPUT = []
    CONTEXT = []
    evt_type = get_event_type(ex)[0]
    context_words = [w for sent in ex['sentences'] for w in sent]

    input_template = ontology_dict[evt_type.replace('n/a', 'unspecified')]['template']
    i = len(input_template)
    # Blank every role-specific <argN> placeholder down to the generic <arg>.
    input_list = []
    for x in range(i):
        blanked = re.sub(r'<arg\d>', '<arg>', input_template[x])
        input_list.append(blanked)

    # input_list now holds the templates with all <argN> replaced by <arg>;
    # next, whitespace-tokenize each of them.
    for x in range(i):
        INPUT.append(input_list[x].split(' '))

    # BPE-tokenize each whitespace-tokenized template. add_prefix_space=True
    # makes BART's BPE treat every word as if preceded by a space, matching
    # how the words appear mid-sentence.
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
    temp = []
    for x in range(len(INPUT)):
        tokenized_input_template = []
        for w in INPUT[x]:
            tokenized_input_template.extend(tokenizer.tokenize(w, add_prefix_space=True))
        temp.append(tokenized_input_template)
    print(temp)

    # Work on a copy: template would otherwise alias the list stored in
    # ontology_dict, and the substitutions below would corrupt it for every
    # later example.
    template = deepcopy(ontology_dict[evt_type.replace('n/a', 'unspecified')]['template'])
    # e.g. for life.injure.unspecified:
    # ['<arg1> was injured by injurer with medical issue at place',
    #  'Victim was injured by <arg2> with medical issue at place',
    #  'Victim was injured by injurer with medical issue at <arg3> place',
    #  'Victim was injured by injurer with <arg4> medical issue at place']

    for lidx, triple in enumerate(ex['gold_evt_links']):
        # Each triple is (trigger span, argument span, argument role), e.g.:
        # [[50, 50], [48, 48], 'evt092arg01victim']
        # [[50, 50], [7, 7], 'evt092arg03place']
        trigger_span, argument_span, arg_name = triple
        if ex['doc_key'] == 'nw_RC013c8e78b7e8a4fb22193483877058f712dfd8b75b7a06d950de0b8f':
            print(len(ex['gold_evt_links']))
        # Which argument slot this role maps to ('arg1', 'arg2', ...).
        arg_num = ontology_dict[evt_type.replace('n/a', 'unspecified')][arg_name]
        # The argument phrase itself.
        arg_text = ' '.join(context_words[argument_span[0]:argument_span[1] + 1])
        # Find the template that still contains this slot and substitute the
        # argument phrase for its <argN> placeholder.
        for index in range(len(template)):
            if arg_num in template[index]:
                break
        template[index] = re.sub('<{}>'.format(arg_num), arg_text, template[index])

    if ex['doc_key'] == 'nw_RC013c8e78b7e8a4fb22193483877058f712dfd8b75b7a06d950de0b8f':
        print('aaa')
        print(template)

    trigger = ex['evt_triggers'][0]
    # One copy of the full document context per template.
    for w in range(i):
        CONTEXT.append(context_words)

    # Gold arguments are filled in now; blank any <argN> placeholders left
    # over from roles without a gold link, then whitespace-tokenize.
    output_template = []
    for j in range(len(template)):
        output_template.append(re.sub(r'<arg\d>', '<arg>', template[j]))
    for j in range(len(output_template)):
        OUTPUT.append(output_template[j].split(' '))
    # tokenized_out_template = []
    # for j in range(len(OUTPUT)):
    #     for w in OUTPUT[j]:
    #         tokenized_out_template.extend(self.tokenizer.tokenize(w, add_prefix_space=True))
    #     OUTPUT.append(tokenized_out_template)
    #     tokenized_out_template = []

    return INPUT, OUTPUT, CONTEXT


def load_ontology():
    """Parse aida_ontology_fj-5.csv into a per-event-type dict of templates and roles."""
    ontology_dict = {}
    with open('aida_ontology_fj-5.csv', 'r') as f:
        for lidx, line in enumerate(f):
            if lidx == 0:  # header
                continue
            fields = line.strip().split(',')
            if len(fields) < 2:
                break
            evt_type = fields[0]
            if evt_type in ontology_dict.keys():
                # All argument roles for this event type.
                args = fields[2:]
                # Append this row's template to the event type's template list.
                ontology_dict[evt_type]['template'].append(fields[1])
                for i, arg in enumerate(args):
                    if arg != '':
                        # Map both directions: 'argN' -> role name and role name -> 'argN'.
                        ontology_dict[evt_type]['arg{}'.format(i + 1)] = arg
                        ontology_dict[evt_type][arg] = 'arg{}'.format(i + 1)
            else:
                # First time this event type appears: create its entry.
                ontology_dict[evt_type] = {}
                args = fields[2:]
                ontology_dict[evt_type]['template'] = []
                ontology_dict[evt_type]['template'].append(fields[1])
                for i, arg in enumerate(args):
                    if arg != '':
                        ontology_dict[evt_type]['arg{}'.format(i + 1)] = arg
                        ontology_dict[evt_type][arg] = 'arg{}'.format(i + 1)
    return ontology_dict
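
# For reference, the shape load_ontology() produces per event type, matching
# the printed entry for life.injure.unspecified in prepare_data() below (the
# CSV row layout is inferred from the parsing code, not copied from the file):
#
#   event_type,template,role_1,role_2,...   # one CSV row per template
#
#   ontology_dict['life.injure.unspecified'] == {
#       'template': ['<arg1> was injured by injurer with medical issue at place', ...],
#       'arg1': 'evt092arg01victim', 'evt092arg01victim': 'arg1',
#       'arg2': 'evt092arg02injurer', 'evt092arg02injurer': 'arg2',
#       'arg3': 'evt092arg03place', 'evt092arg03place': 'arg3',
#       'arg4': 'evt092arg04medicalissue', 'evt092arg04medicalissue': 'arg4',
#   }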
def prepare_data():
    ontology_dict = load_ontology()
    for split, f in [('val', './data/RAMS_1.0/data/dev.jsonlines'),
                     ('train', './data/RAMS_1.0/data/train.jsonlines'),
                     ('test', './data/RAMS_1.0/data/test_head_coref.jsonlines')]:
        # , open('head_templates_preprocessed_data/{}.jsonl'.format(split), 'w') as writer
        with open(f, 'r') as reader:
            # print(ontology_dict['contact.prevarication.broadcast'])
            for lidx, line in enumerate(reader):
                ex = json.loads(line.strip())
                event_type = get_event_type(ex)[0]
                if ex['doc_key'] == 'nw_RC013c8e78b7e8a4fb22193483877058f712dfd8b75b7a06d950de0b8f':
                    # Sample example printed here:
                    # {'rel_triggers': [], 'gold_rel_links': [],
                    #  'doc_key': 'nw_RC04992035300b2ec94d8692646a28dc8b5f210c94842d20834c5342df',
                    #  'ent_spans': [[48, 48, [['evt092arg01victim', 1.0]]], [7, 7, [['evt092arg03place', 1.0]]]],
                    #  'language_id': 'eng',
                    #  'source_url': 'http://news.sky.com/story/attack-in-nice-truck-ploughes-into-crowd-10502068',
                    #  'evt_triggers': [[50, 50, [['life.injure.n/a', 1.0]]]], 'split': 'test',
                    #  'sentences': [[':', ':', 'History', 'Of', 'Deadly', 'Attacks', 'In', 'France'],
                    #                ['One', ',', 'Laurence', 'Olding', ',', 'was', 'with', 'his', 'fiance', 'and', 'jumped', 'over', 'a', 'promenade', 'wall', 'onto', 'the', 'concrete', 'below', 'to', 'avoid', 'the', 'truck', '.'],
                    #                ['Emerging', 'from', 'hospital', 'in', 'bandages', 'he', 'said', ':', '"', 'There', 'was', 'debris', 'in', 'the', 'streets', ',', 'people', 'lying', 'injured', 'or', 'dead', 'in', 'the', 'road', '.', '"'],
                    #                ['Video', ':', 'Hollande', 'On', "'", 'Horror', "'", 'Of', 'Attack'],
                    #                ['Two', 'Americans', '-', 'Sean', 'Copeland', 'and', 'his', '11-year', '-', 'old', 'son', 'Brodie', 'from', 'Texas', '-', 'have', 'been', 'confirmed', 'among', 'the', 'dead', ',', 'a', 'US', 'official', 'said', '.']],
                    #  'gold_evt_links': [[[50, 50], [48, 48], 'evt092arg01victim'],
                    #                     [[50, 50], [7, 7], 'evt092arg03place']],
                    #  'clusters': [[[10, 11], [15, 15], [37, 37]], [[70, 71], [73, 73]]],
                    #  'corefs': [[[11, 11], [15, 15], [37, 37]], [[71, 71], [73, 73]]]}
                    print(ex)
                    # Matching ontology entry:
                    # {'template': ['<arg1> was injured by injurer with medical issue at place',
                    #               'Victim was injured by <arg2> with medical issue at place',
                    #               'Victim was injured by injurer with medical issue at <arg3> place',
                    #               'Victim was injured by injurer with <arg4> medical issue at place'],
                    #  'arg1': 'evt092arg01victim', 'evt092arg01victim': 'arg1',
                    #  'arg2': 'evt092arg02injurer', 'evt092arg02injurer': 'arg2',
                    #  'arg3': 'evt092arg03place', 'evt092arg03place': 'arg3',
                    #  'arg4': 'evt092arg04medicalissue', 'evt092arg04medicalissue': 'arg4'}
                    print(ontology_dict[event_type.replace('n/a', 'unspecified')])
                input_template, output_template, context = create_gold_gen(ex, ontology_dict, True)
                # create_gold_gen works on a copy of the templates, so the
                # ontology no longer needs to be reloaded here.
                if ex['doc_key'] == 'nw_RC013c8e78b7e8a4fb22193483877058f712dfd8b75b7a06d950de0b8f':
                    # [['<arg>', 'was', 'injured', 'by', 'injurer', 'with', 'medical', 'issue', 'at', 'place'],
                    #  ['Victim', 'was', 'injured', 'by', '<arg>', 'with', 'medical', 'issue', 'at', 'place'],
                    #  ['Victim', 'was', 'injured', 'by', 'injurer', 'with', 'medical', 'issue', 'at', '<arg>', 'place'],
                    #  ['Victim', 'was', 'injured', 'by', 'injurer', 'with', '<arg>', 'medical', 'issue', 'at', 'place']]
                    print(input_template)
                    # [['people', 'was', 'injured', 'by', 'injurer', 'with', 'medical', 'issue', 'at', 'place'],
                    #  ['Victim', 'was', 'injured', 'by', '<arg>', 'with', 'medical', 'issue', 'at', 'place'],
                    #  ['Victim', 'was', 'injured', 'by', 'injurer', 'with', 'medical', 'issue', 'at', '<arg>', 'place'],
                    #  ['Victim', 'was', 'injured', 'by', 'injurer', 'with', '<arg>', 'medical', 'issue', 'at', 'place']]
                    print(output_template)
                # input_template, output_template and context are parallel
                # lists (4 entries each for this event type):
                # print(len(input_template)); print(len(output_template)); print(len(context))
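
# The jsonlines import and the commented-out writer above suggest each split's
# processed templates are meant to be serialized to
# head_templates_preprocessed_data/{split}.jsonl. A minimal sketch of that
# step; the record field names ('input', 'output', 'context') are
# illustrative, not taken from the original:
def write_split(split, examples):
    """examples: iterable of (INPUT, OUTPUT, CONTEXT) triples from create_gold_gen."""
    with jsonlines.open('head_templates_preprocessed_data/{}.jsonl'.format(split), mode='w') as writer:
        for input_templates, output_templates, contexts in examples:
            # One record per template, keeping the three lists aligned.
            for inp, out, ctx in zip(input_templates, output_templates, contexts):
                writer.write({'input': inp, 'output': out, 'context': ctx})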

prepare_data()