# Source: Hugging Face Space "adherent/Bart-gen-arg" (commit 44a9d55).
# Page status when captured: "Build error". File size: 16,112 bytes.


import re
from copy import deepcopy
import transformers
from transformers import BartTokenizer
import jsonlines
import json


# dict = {"rel_triggers": [], "gold_rel_links": [], "doc_key": "nw_RC00c8620ef5810429342a1c339e6c76c1b0b9add3f6010f04482fd832", "ent_spans": [[27, 27, [["evt043arg01communicator", 1.0]]], [48, 48, [["evt043arg03place", 1.0]]], [32, 36, [["evt043arg02recipient", 1.0]]]], "language_id": "eng", "source_url": "http://bbc.co.uk/sport/athletics/36295481", "evt_triggers": [[31, 31, [["contact.prevarication.broadcast", 1.0]]]], "split": "test", "sentences": [["We", "are", "ashamed", "of", "them", ".", "\""], ["However", ",", "Mutko", "stopped", "short", "of", "admitting", "the", "doping", "scandal", "was", "state", "sponsored", "."], ["\"", "We", "are", "very", "sorry", "that", "athletes", "who", "tried", "to", "deceive", "us", ",", "and", "the", "world", ",", "were", "not", "caught", "sooner", "."], ["We", "are", "very", "sorry", "because", "Russia", "is", "committed", "to", "upholding", "the", "highest", "standards", "in", "sport", "and", "is", "opposed", "to", "anything", "that", "threatens", "the", "Olympic", "values", ",", "\"", "he", "said", "."], ["English", "former", "heptathlete", "and", "Athens", "2004", "bronze", "medallist", "Kelly", "Sotherton", "was", "unhappy", "with", "Mutko", "'s", "plea", "for", "Russia", "'s", "ban", "to", "be", "lifted", "for", "Rio"]], "gold_evt_links": [[[31, 31], [27, 27], "evt043arg01communicator"], [[31, 31], [32, 32], "evt043arg02recipient"], [[31, 31], [48, 48], "evt043arg03place"]], "clusters": [[[0, 0], [22, 22], [32, 32], [43, 43], [48, 48], [90, 91]], [[9, 9], [70, 70], [86, 87]]], "corefs": [[[0, 0], [22, 22], [32, 32], [43, 43], [48, 48], [90, 90]], [[9, 9], [70, 70], [86, 86]]]}
#
# template = "what is the <arg>"
#
# context_words = [w for sent in dict['sentences'] for w in sent]
#
# argtext = context_words[27]
#
# print(argtext)
#
# template = re.sub('<arg>', argtext, template)
#
# print(template)

# for lidx, triple in enumerate(dict['gold_evt_links']):
        # # trigger, argument, argument (role name)
        # # Example: "gold_evt_links":
        # # [[[40, 40], [33, 33], "evt089arg01victim"],
        # #  [[40, 40], [28, 28], "evt089arg02place"]]
        # trigger_span, argument_span, arg_name = triple
        # # which argument slot this role maps to
        # arg_num = ontology_dict[evt_type.replace('n/a', 'unspecified')][arg_name]
        # # the concrete argument text (a phrase)
        # arg_text = ' '.join(context_words[argument_span[0]:argument_span[1] + 1])
        # # use a regular expression to replace each <arg> in the template with the concrete argument text
        # # replace the <arg> placeholders in the list one after another, in order
        # template[lidx] = re.sub('<{}>'.format(arg_num), arg_text, template[lidx])
# # "ent_spans": [[27, 27, [["evt043arg01communicator", 1.0]]],
# #               [48, 48, [["evt043arg03place", 1.0]]],
# #               [32, 36, [["evt043arg02recipient", 1.0]]]]
# context_words = [w for sent in dict['sentences'] for w in sent]
#
# print(context_words[32])
# print(context_words[33])
# print(context_words[34])
# print(context_words[35])
# print(context_words[36])

def get_event_type(ex):
    """Return the event-type labels of all triggers in *ex*, in order.

    Each item of ``ex['evt_triggers']`` is indexed as
    ``[start, end, [[label, score], ...]]``; the first element of every
    inner pair is collected into one flat list.
    """
    return [
        scored_label[0]
        for trigger in ex['evt_triggers']
        for scored_label in trigger[2]
    ]

def create_gold_gen(ex, ontology_dict, mark_trigger=True):
    """Build template inputs, gold outputs and contexts for one example.

    The event type of the example selects an ontology entry whose
    ``'template'`` value is a list of template strings containing numbered
    placeholders such as ``<arg1>``.

    Args:
        ex: One example dict. The keys read here are ``'evt_triggers'``,
            ``'sentences'`` (list of token lists) and ``'gold_evt_links'``
            (list of ``[trigger_span, argument_span, role_name]`` with
            inclusive token spans into the flattened sentences).
        ontology_dict: Mapping event type -> entry. An entry holds the
            ``'template'`` list and maps each role name to its slot name
            (e.g. ``'evt092arg01victim' -> 'arg1'``). It is not modified.
        mark_trigger: Accepted for interface compatibility; not used.

    Returns:
        A tuple ``(INPUT, OUTPUT, CONTEXT)`` of three lists with one item per
        template:

        * INPUT: the template with every ``<argN>`` rewritten to ``<arg>``,
          split on single spaces.
        * OUTPUT: the template with gold argument text substituted for its
          placeholder, remaining ``<argN>`` rewritten to ``<arg>``, split on
          single spaces.
        * CONTEXT: the flattened document tokens (the same list object is
          repeated for every template).

    Raises:
        IndexError: If the example carries no event-type label.
        KeyError: If the event type or a gold role name is missing from
            ``ontology_dict``.
    """
    # First label over all triggers -- the same value as get_event_type(ex)[0].
    event_types = [label[0] for trigger in ex['evt_triggers'] for label in trigger[2]]
    evt_type = event_types[0]
    entry = ontology_dict[evt_type.replace('n/a', 'unspecified')]

    context_words = [w for sent in ex['sentences'] for w in sent]

    # Work on a copy: the substitutions below must not leak gold fillers of
    # this example into the shared ontology used for later examples.
    templates = list(entry['template'])

    input_tokens = [re.sub(r'<arg\d>', '<arg>', t).split(' ') for t in templates]

    for _trigger_span, argument_span, arg_name in ex['gold_evt_links']:
        placeholder = '<{}>'.format(entry[arg_name])
        arg_text = ' '.join(context_words[argument_span[0]:argument_span[1] + 1])
        # Fill the first template that still carries this placeholder. If a
        # role has several gold fillers only the first one finds a
        # placeholder; later ones are left out.
        for idx, candidate in enumerate(templates):
            if placeholder in candidate:
                # Plain string replacement: the argument text is literal and
                # must not be parsed as a regex replacement template.
                templates[idx] = candidate.replace(placeholder, arg_text)
                break

    # Placeholders without a gold filler collapse to the generic <arg>.
    output_tokens = [re.sub(r'<arg\d>', '<arg>', t).split(' ') for t in templates]

    # One context per template so the three lists stay aligned.
    contexts = [context_words for _ in templates]

    return input_tokens, output_tokens, contexts


def load_ontology(path='aida_ontology_fj-5.csv'):
    """Parse the ontology CSV into a per-event-type lookup dict.

    The first line is treated as a header and skipped. Every following line
    is split on ``','`` into ``event_type, template, role_1, role_2, ...``.
    Several lines may share an event type; their templates are collected in
    file order. Reading stops at the first line with fewer than two fields
    (for example a blank line).

    Args:
        path: Location of the CSV file. Defaults to the file name used by
            this script, resolved against the current working directory.

    Returns:
        ``{event_type: entry}`` where ``entry['template']`` is the list of
        template strings and, for every non-empty role in column ``2 + k``,
        ``entry['arg<k+1>']`` is the role name and ``entry[role_name]`` is
        ``'arg<k+1>'``.
    """
    ontology_dict = {}
    with open(path, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header line (no-op on an empty file)
        for line in f:
            # NOTE: a plain split is kept on purpose; quoted fields that
            # contain commas are not supported by this format.
            fields = line.strip().split(',')
            if len(fields) < 2:
                break
            evt_type, template, *roles = fields
            # First sighting creates the entry; later lines extend it.
            entry = ontology_dict.setdefault(evt_type, {'template': []})
            entry['template'].append(template)
            for position, role in enumerate(roles, start=1):
                if role != '':
                    slot = 'arg{}'.format(position)
                    # Two-way mapping: slot -> role name and role name -> slot.
                    entry[slot] = role
                    entry[role] = slot

    return ontology_dict

def prepare_data():
    """Run template generation over the RAMS dev, train and test files.

    Every line of each ``.jsonlines`` file is parsed as one example and
    passed to ``create_gold_gen``. Nothing is written to disk; for one
    hard-coded document the example, its ontology entry and the generated
    input/output templates are printed for inspection.
    """
    # Document whose intermediate structures are dumped to stdout.
    debug_doc_key = 'nw_RC013c8e78b7e8a4fb22193483877058f712dfd8b75b7a06d950de0b8f'

    # The ontology file does not change while this runs, so parse it once
    # instead of re-reading the CSV after every example.
    ontology_dict = load_ontology()

    data_files = [
        ('val', './data/RAMS_1.0/data/dev.jsonlines'),
        ('train', './data/RAMS_1.0/data/train.jsonlines'),
        ('test', './data/RAMS_1.0/data/test_head_coref.jsonlines'),
    ]
    for _split, path in data_files:
        with open(path, 'r', encoding='utf-8') as reader:
            for line in reader:
                ex = json.loads(line.strip())
                event_type = get_event_type(ex)[0]
                ontology_key = event_type.replace('n/a', 'unspecified')
                is_debug_doc = ex['doc_key'] == debug_doc_key

                if is_debug_doc:
                    # The raw example: doc_key, ent_spans, evt_triggers,
                    # sentences, gold_evt_links, clusters, corefs, ...
                    print(ex)
                    # The ontology entry, e.g.
                    # {'template': ['<arg1> was injured by injurer with medical issue at place', ...],
                    #  'arg1': 'evt092arg01victim', 'evt092arg01victim': 'arg1', ...}
                    print(ontology_dict[ontology_key])

                # create_gold_gen may write gold fillers into the template
                # list it is given, and it only looks up the entry of this
                # example's event type. Hand it a private copy of that one
                # entry so the shared ontology stays pristine.
                scratch_ontology = {ontology_key: deepcopy(ontology_dict[ontology_key])}
                input_template, output_template, _context = create_gold_gen(
                    ex, scratch_ontology, True)

                if is_debug_doc:
                    # One token list per template with placeholders as '<arg>', e.g.
                    # [['<arg>', 'was', 'injured', 'by', 'injurer', ...], ...]
                    print(input_template)
                    # Same shape with gold arguments filled in, e.g.
                    # [['people', 'was', 'injured', 'by', 'injurer', ...], ...]
                    print(output_template)
                # The three returned lists have one item per template
                # (4 / 4 / 4 for the sample event type above).
# NOTE(review): the two list literals below are bare expression statements.
# They are evaluated and discarded, so they have no runtime effect; they look
# like pasted debug output. This first one is a per-sentence token list (the
# same tokens as the sample 'sentences' value quoted in prepare_data).
[[':', ':', 'History', 'Of', 'Deadly', 'Attacks', 'In', 'France'],
 ['One', ',', 'Laurence', 'Olding', ',', 'was', 'with', 'his', 'fiance', 'and', 'jumped', 'over', 'a', 'promenade', 'wall', 'onto', 'the', 'concrete', 'below', 'to', 'avoid', 'the', 'truck', '.'],
 ['Emerging', 'from', 'hospital', 'in', 'bandages', 'he', 'said', ':', '"', 'There', 'was', 'debris', 'in', 'the', 'streets', ',', 'people', 'lying', 'injured', 'or', 'dead', 'in', 'the', 'road', '.', '"'],
 ['Video', ':', 'Hollande', 'On', "'", 'Horror', "'", 'Of', 'Attack'],
 ['Two', 'Americans', '-', 'Sean', 'Copeland', 'and', 'his', '11-year', '-', 'old', 'son', 'Brodie', 'from', 'Texas', '-', 'have', 'been', 'confirmed', 'among', 'the', 'dead', ',', 'a', 'US', 'official', 'said', '.']]

# The same tokens flattened into a single list (also a no-op expression).
[':', ':', 'History', 'Of', 'Deadly', 'Attacks', 'In', 'France',

 'One', ',', 'Laurence', 'Olding', ',', 'was', 'with', 'his', 'fiance', 'and', 'jumped', 'over', 'a', 'promenade', 'wall', 'onto', 'the', 'concrete', 'below', 'to', 'avoid', 'the', 'truck', '.',
 'Emerging', 'from', 'hospital', 'in', 'bandages', 'he', 'said', ':', '"', 'There', 'was', 'debris', 'in', 'the', 'streets', ',', 'people', 'lying', 'injured', 'or', 'dead', 'in', 'the', 'road', '.', '"',
 'Video', ':', 'Hollande', 'On', "'", 'Horror', "'", 'Of', 'Attack',
 'Two', 'Americans', '-', 'Sean', 'Copeland', 'and', 'his', '11-year', '-', 'old', 'son', 'Brodie', 'from', 'Texas', '-', 'have', 'been', 'confirmed', 'among', 'the', 'dead', ',', 'a', 'US', 'official', 'said', '.']

# Entry point: there is no __main__ guard, so the whole pipeline runs both
# when the file is executed as a script and whenever it is imported.
prepare_data()