"""Generate template-based Dutch sentences about traffic crashes.

Each sentence is built by filling a ``[[FIELD]]`` template with randomly
chosen phrases and is paired with a label recording how many parties the
sentence mentions, how many of those are human, and whether the sentence
uses an active construction.  The unique sentences are written to a JSON
Lines file.
"""
import json
import os
import random

# Seed once at import time so that a run of the script is reproducible.
random.seed(2021)

NUM_SENTENCES = 100_000
# Stop generating after this many *consecutive* duplicate sentences.
NUM_FAILS = 25
OUTPUT_PATH = "output/crashes/generate_templates/sentences.jsonl"

SENT_TYPES = ("0_PTY", "1_PTY", "2_PTY")
SENT_1_PTY_TYPES = ("VICTIM", "OUTCOME", "DRIVE")
SENT_ACTIVE_TYPES = ("ACTIVE", "NON_ACTIVE")

SENTS_0_PTY_OUTCOME = (
    "[[OUTCOME]] [[CIRCUMSTANCE]] [[PLACE]]",
    "[[OUTCOME]] [[CIRCUMSTANCE]] [[TIME]]",
    "[[OUTCOME]] [[CIRCUMSTANCE]]",
)
SENTS_1_PTY_VICTIM = (
    "[[SUBJECT]] [[VERB_V2]] [[PLACE]]",
    "[[SUBJECT]] [[TIME]] [[VERB_V2]]",
    "[[SUBJECT]] [[VERB_V2]]",
)
SENTS_1_PTY_OUTCOME = (
    "[[SUBJECT]] [[OUTCOME]] [[PLACE]] [[CIRCUMSTANCE]]",
    "[[SUBJECT]] [[OUTCOME]] [[CIRCUMSTANCE]]",
)
SENTS_1_PTY_DRIVE = (
    "[[SUBJECT]] [[VP_DRIVE]] [[PLACE]]",
    "[[SUBJECT]] [[VP_DRIVE]]",
)
SENTS_2_PTYS = (
    "[[SUBJECT]] [[VERB_V2]] [[VERB_P]] [[OTHER]] [[VERB_REST]] [[PLACE]]",
    "[[SUBJECT]] [[VERB_V2]] [[TIME]] [[VERB_P]] [[OTHER]] [[VERB_REST]]",
    "[[SUBJECT]] [[VERB_V2]] [[VERB_P]] [[OTHER]] [[VERB_REST]]",
)

PLACES = (
    "op stationsplein",
    "in stadscentrum",
    "op kruispunt Westerhaven",
    "op A27",
    "op A10",
    "in Lelystad",
    "in Assen",
    "in Amsterdam",
    "bij Renkum",
    "in Schilderswijk",
    "bij knooppunt Lunetten",
    "op zuidelijke ringweg",
    "in de buurt van de Erasmusbrug",
    "op schoolplein Stedelijk Gymnasium",
    "bij afrit Rotterdam-Noord",
    "op Kanaleneiland",
)
TIMES = (
    "tijdens avondspits",
    "vrijdagavond",
    "dinsdagochtend",
    "donderdagnacht",
    "rond middaguur",
)
CIRCUMSTANCES = ("na ongeluk", "na aanrijding", "na botsing", "na crash")
# Circumstances that mention an agent; the duplicate entry is kept as in the
# original table (it doubles that phrase's sampling weight).
CIRCUMSTANCES_AGT = (
    ", dader ervandoor",
    ", dader ervandoor",
    ", dader rijdt door",
    ", bestuurder rijdt door",
)

OUTCOME_0_TYPES = ("TRAFFIC", "HUMAN")
OUTCOMES_0_TRAFFIC = ("verkeersopstopping", "file", "veel vertraging")
OUTCOMES_0_HUMAN = ("dode", "zwaargewonde", "gewonde", "drie gewonden")
OUTCOMES_1 = ("dood", "overleden", "zwaargewond", "lichtgewond", "ongedeerd")

SUBJECT_TYPES = ("WEAK_PTY", "DRIVER", "VEHICLE")

VPS_DRIVE_ACTIVE = ("rijdt tegen boom", "veroorzaakt ongeluk")
VPS_DRIVE_NON_ACTIVE = (
    "verongelukt",
    "gecrasht",
    "uit de bocht gevlogen",
    "raakt gewond",
    "raakt gewond door klap",
)

EVENT_VERBS_1_VICTIM = (
    "aangereden",
    "geschept",
    "raakt gewond",
    "raakt gewond door klap",
)
# Two-party verbs are encoded as "V2|particle|rest"; "_" marks an empty slot.
EVENT_VERBS_2_ACTIVE_ANY = ("raakt|_|_", "botst|op|_", "botst|tegen|_")
EVENT_VERBS_2_ACTIVE_DRIVE = ("rijdt|_|aan", "rijdt|_|dood", "schept|_|_")
EVENT_VERBS_2_NON_ACTIVE_DRIVER = ("aangereden|door|_", "geschept|door|_")
EVENT_VERBS_2_NON_ACTIVE_VEHICLE = (
    "aangereden|door|_",
    "geschept|door|_",
    "komt|onder|_",
)
EVENT_VERBS_2_NON_ACTIVE_ANY = ("geraakt|door|_",)

WEAK_PTY_NPS = (
    "fietser",
    "skateboarder",
    "wielrenner",
    "rolschaatser",
    "jogger",
    "voetganger",
    "motorrijder",
    "fietskoerier",
    "[[PERSON]] op fiets",
    "[[PERSON]] op e-bike",
)
ANY_PERSON_NPS = (
    "vrouw",
    "man",
    "meisje",
    "jongen",
    "bejaarde vrouw",
    "bejaarde man",
    "Duitser",
    "toerist",
)
CYCLIST_PERSON_NPS = ("postbode", "maaltijdbezorger", "politieagent")
DRIVER_NPS = (
    "automobilist",
    "automobiliste",
    "bestuurder",
    "dronken automobilist",
    "dronken bestuurder",
    "motorrijder",
    "minderjarige bestuurder",
    "trucker",
    "taxichauffeur",
    "[[PERSON]] in auto",
    "dronken [[PERSON]] in auto",
)
VEHICLE_NPS = (
    "auto",
    "personenauto",
    "vrachtwagen",
    "tractor",
    "auto met caravan",
    "scooter",
    "motor",
    "tram",
    "stadsbus",
    "lijn 10",
    "touringcar",
    "camper",
    "vorkheftruck",
)


def generate_weak_pty() -> str:
    """Return a random noun phrase from ``WEAK_PTY_NPS``.

    A ``[[PERSON]]`` placeholder in the chosen phrase is filled with a
    random entry from ``ANY_PERSON_NPS + CYCLIST_PERSON_NPS``.
    """
    noun_phrase = random.choice(WEAK_PTY_NPS)
    if "[[PERSON]]" in noun_phrase:
        person = random.choice(ANY_PERSON_NPS + CYCLIST_PERSON_NPS)
        return noun_phrase.replace("[[PERSON]]", person)
    return noun_phrase


def generate_driver() -> str:
    """Return a random noun phrase from ``DRIVER_NPS``.

    A ``[[PERSON]]`` placeholder in the chosen phrase is filled with a
    random entry from ``ANY_PERSON_NPS``.
    """
    noun_phrase = random.choice(DRIVER_NPS)
    if "[[PERSON]]" in noun_phrase:
        person = random.choice(ANY_PERSON_NPS)
        return noun_phrase.replace("[[PERSON]]", person)
    return noun_phrase


def make_sentence(template: str, fields: dict) -> str:
    """Fill ``template`` with ``fields`` and tidy the result.

    Every ``[[NAME]]`` placeholder whose ``NAME`` is a key of ``fields`` is
    replaced by the corresponding value.  ``_`` characters (empty verb
    slots) are removed, whitespace runs are collapsed to a single space, a
    space directly before a comma is dropped, and the first character is
    upper-cased.  An empty result is returned unchanged.
    """
    sentence = template
    for field, value in fields.items():
        sentence = sentence.replace(f"[[{field}]]", value)
    # Removing "_" slots leaves doubled spaces; split/join collapses any run
    # of whitespace and strips both ends in one pass.
    sentence = " ".join(sentence.replace("_", "").split())
    # Agent circumstances start with ", ", so the template's separating
    # space would otherwise end up in front of the comma.
    sentence = sentence.replace(" ,", ",")
    if not sentence:
        return sentence
    return sentence[0].upper() + sentence[1:]


def _pick_circumstance(label: dict) -> str:
    """Return a circumstance phrase; count a human party if it names an agent."""
    if random.random() < 0.5:
        return random.choice(CIRCUMSTANCES)
    label["party_mentioned"] += 1
    label["party_human"] += 1
    return random.choice(CIRCUMSTANCES_AGT)


def _make_party(party_type: str, label: dict) -> str:
    """Return a noun phrase for ``party_type`` and record the party in ``label``.

    ``"WEAK_PTY"`` and ``"DRIVER"`` count as human parties; any other value
    is treated as a vehicle.
    """
    label["party_mentioned"] += 1
    if party_type == "WEAK_PTY":
        label["party_human"] += 1
        return generate_weak_pty()
    if party_type == "DRIVER":
        label["party_human"] += 1
        return generate_driver()
    return random.choice(VEHICLE_NPS)


def _fill_0_pty(fields: dict, label: dict) -> str:
    """Fill the fields of an outcome-only sentence and return its template."""
    fields["CIRCUMSTANCE"] = _pick_circumstance(label)
    if random.choice(OUTCOME_0_TYPES) == "TRAFFIC":
        fields["OUTCOME"] = random.choice(OUTCOMES_0_TRAFFIC)
    else:
        fields["OUTCOME"] = random.choice(OUTCOMES_0_HUMAN)
        label["party_mentioned"] += 1
        label["party_human"] += 1
    return random.choice(SENTS_0_PTY_OUTCOME)


def _fill_1_pty(fields: dict, label: dict) -> str:
    """Fill the fields of a single-subject sentence and return its template."""
    subtype = random.choice(SENT_1_PTY_TYPES)
    if subtype == "VICTIM":
        fields["SUBJECT"] = _make_party("WEAK_PTY", label)
        fields["VERB_V2"] = random.choice(EVENT_VERBS_1_VICTIM)
        return random.choice(SENTS_1_PTY_VICTIM)
    if subtype == "OUTCOME":
        # Only the OUTCOME templates contain [[CIRCUMSTANCE]], so the
        # circumstance (and the agent it may add to the label) is drawn
        # here rather than for every 1-party sentence.
        fields["CIRCUMSTANCE"] = _pick_circumstance(label)
        fields["OUTCOME"] = random.choice(OUTCOMES_1)
        subject_type = random.choice(["WEAK_PTY", "DRIVER"])
        fields["SUBJECT"] = _make_party(subject_type, label)
        return random.choice(SENTS_1_PTY_OUTCOME)
    # DRIVE
    if random.choice(SENT_ACTIVE_TYPES) == "ACTIVE":
        fields["VP_DRIVE"] = random.choice(VPS_DRIVE_ACTIVE)
        label["active"] = True
    else:
        fields["VP_DRIVE"] = random.choice(VPS_DRIVE_NON_ACTIVE)
    subject_type = random.choice(["DRIVER", "VEHICLE"])
    fields["SUBJECT"] = _make_party(subject_type, label)
    return random.choice(SENTS_1_PTY_DRIVE)


def _fill_2_pty(fields: dict, label: dict) -> str:
    """Fill the fields of a subject/other sentence and return its template."""
    any_party = ["WEAK_PTY", "DRIVER", "VEHICLE"]
    if random.choice(SENT_ACTIVE_TYPES) == "ACTIVE":
        label["active"] = True
        subject_type = random.choice(any_party)
        if subject_type == "WEAK_PTY":
            verb_pool = EVENT_VERBS_2_ACTIVE_ANY
        elif subject_type == "DRIVER":
            if random.random() < 0.5:
                verb_pool = EVENT_VERBS_2_ACTIVE_ANY
            else:
                verb_pool = EVENT_VERBS_2_ACTIVE_DRIVE
        else:  # vehicle
            verb_pool = EVENT_VERBS_2_ACTIVE_ANY + EVENT_VERBS_2_ACTIVE_DRIVE
        verb = random.choice(verb_pool)
        # The "drive" verbs always take a weak party as object; the generic
        # verbs take a weak party or a vehicle.  Deriving this from the verb
        # also gives the vehicle-subject case a defined object type.
        if verb in EVENT_VERBS_2_ACTIVE_DRIVE:
            other_type = "WEAK_PTY"
        else:
            other_type = random.choice(["WEAK_PTY", "VEHICLE"])
    else:
        other_type = random.choice(any_party)
        if other_type == "WEAK_PTY" or random.random() < 0.5:
            verb_pool = EVENT_VERBS_2_NON_ACTIVE_ANY
            subject_type = random.choice(any_party)
        else:
            # Agent-specific passive verbs always have a weak party as subject.
            if other_type == "DRIVER":
                verb_pool = EVENT_VERBS_2_NON_ACTIVE_DRIVER
            else:
                verb_pool = EVENT_VERBS_2_NON_ACTIVE_VEHICLE
            subject_type = "WEAK_PTY"
        verb = random.choice(verb_pool)

    fields["VERB_V2"], fields["VERB_P"], fields["VERB_REST"] = verb.split("|")
    fields["SUBJECT"] = _make_party(subject_type, label)
    fields["OTHER"] = _make_party(other_type, label)
    return random.choice(SENTS_2_PTYS)


def generate_sentence():
    """Generate one random sentence together with its label.

    Returns:
        A ``(sentence, label)`` tuple.  ``label`` is a dict with the keys
        ``"party_mentioned"`` (int), ``"party_human"`` (int) and
        ``"active"`` (bool).
    """
    label = {"party_mentioned": 0, "party_human": 0, "active": False}
    fields = {}
    fields["TIME"] = random.choice(TIMES)
    fields["PLACE"] = random.choice(PLACES)

    sent_type = random.choice(SENT_TYPES)
    if sent_type == "0_PTY":
        template = _fill_0_pty(fields, label)
    elif sent_type == "1_PTY":
        template = _fill_1_pty(fields, label)
    else:
        template = _fill_2_pty(fields, label)
    return make_sentence(template, fields), label


def main(num_sentences=NUM_SENTENCES, num_fails=NUM_FAILS,
         output_path=OUTPUT_PATH):
    """Generate unique sentences and write them to a JSON Lines file.

    Generation stops once ``num_sentences`` unique sentences exist or
    ``num_fails`` consecutive draws were duplicates.  When a sentence is
    drawn again, the label of its first occurrence is kept.

    Args:
        num_sentences: Maximum number of unique sentences to generate.
        num_fails: Number of consecutive duplicates that ends generation.
        output_path: Destination file; its directory is created if missing.
    """
    sentences = {}
    dup_fails = 0
    while len(sentences) < num_sentences and dup_fails < num_fails:
        sentence, label = generate_sentence()
        if sentence in sentences:
            dup_fails += 1
        else:
            sentences[sentence] = label
            dup_fails = 0

    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Text-mode files translate "\n" themselves; writing os.linesep here
    # would produce "\r\r\n" on Windows.
    with open(output_path, "w", encoding="utf-8") as f_out:
        for sentence, label in sentences.items():
            record = {"sentence": sentence, "label": label}
            f_out.write(json.dumps(record) + "\n")
        # Final blank line after the last record.
        f_out.write("\n")


if __name__ == "__main__":
    main()