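"""Split the crash-report corpus into dev/main portions.

Reads the scraped JSON dump, detects each article's language, sentence-segments
titles, summaries, and bodies, and writes per-split text files, id files, and
metadata CSVs under output/crashes/split_data.
"""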
import json
import os
import random
import shutil
from datetime import datetime

import langdetect
import nltk
import pandas as pd
from langdetect import DetectorFactory, LangDetectException
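# NOTE: nltk.sent_tokenize (used below) needs the Punkt sentence-tokenizer
# models; if they are missing, run nltk.download("punkt") once beforehand.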

DATA_FILE = "data/crashes/thecrashes_data_all_text.json"
DEV_PORTION = 0.10  # fraction of events routed to the dev split
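# fix both RNGs so the dev/main split and langdetect's guesses are reproducible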
random.seed(2001)
DetectorFactory.seed = 0


def is_a_real_time(timestamp):
    """Check whether a timestamp carries a real time of day."""
    # 00:00:00 (midnight) is the "empty" timestamp, ignore it
    if timestamp.hour == timestamp.minute == timestamp.second == 0:
        return False
    return True


def main():
    process_events()


def detect_language(article):
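    """Guess an article's language code, preferring body text, then summary, then title."""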
    if article["alltext"]:
        sample = article["alltext"]
    elif article["summary"]:
        sample = article["summary"]
    else:
        sample = article["title"]
    try:
        return langdetect.detect(sample)
    except LangDetectException:
        print(f"\tCould not detect language for text_id={article['id']}")
        print(f"\tSample={sample}")
        print()
        return "UNK_LANG"


def extract_text_info(event):
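    """Collect segmented text lines, matching id lines, and metadata rows for each article of an event."""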
    ev_text_lines = []
    ev_id_lines = []
    ev_meta_rows = []
    for article in event["articles"]:
        text_id = article["id"]
        try:
            pubdate = datetime.fromisoformat(article["publishedtime"]).strftime("%Y-%m-%d %H:%M:%S")
        except ValueError:
            print(f"\t\tcould not parse date {article['publishedtime']}")
            pubdate = None
        url = article["url"]
        provider = article["sitename"]
        title = article["title"]
        language = detect_language(article)
        ev_meta_rows.append({
            "event_id": event["id"],
            "text_id": text_id,
            "pubdate": pubdate,
            "language": language,
            "url": url,
            "provider": provider,
            "title": title
        })
        summary = article["summary"]
        body = article["alltext"]
        # one text line per sentence, with a parallel id line recording its origin
        text_lines = []
        id_lines = []
        for line in segment(title, language):
            text_lines.append(line)
            id_lines.append(f"event {event['id']}\ttext {text_id}\ttitle")
        for line in segment(summary, language):
            text_lines.append(line)
            id_lines.append(f"event {event['id']}\ttext {text_id}\tsummary")
        for line in segment(body, language):
            text_lines.append(line)
            id_lines.append(f"event {event['id']}\ttext {text_id}\tbody")
        ev_text_lines.append(text_lines)
        ev_id_lines.append(id_lines)
    return ev_text_lines, ev_id_lines, ev_meta_rows


def segment(text, language):
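    """Split text into a list of sentences, picking an NLTK tokenizer by language code."""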
    # don't split Hebrew or Vietnamese (we have no sentence segmenter for them);
    # return the whole text as a single "sentence" so callers still get a list
    if language in ["he", "vi"]:
        return [text]
    lang_map = {
        "nl": "dutch",
        "en": "english",
        "es": "spanish",
        "de": "german",
        "fr": "french",
        "ru": "russian",
        "pt": "portuguese"
    }
    nltk_lang = lang_map.get(language)
    # fallback for languages without a sentence tokenizer in NLTK (apart from Hebrew):
    if not nltk_lang:
        if language == "af":
            # treat Afrikaans as Dutch
            nltk_lang = "dutch"
        else:
            print(f"Found an article with unsupported language={language}, falling back to English NLTK")
            nltk_lang = "english"
    return nltk.sent_tokenize(text, nltk_lang)


def write_to_text_by_event(text_lines, text_meta_rows, event_id, split_to_dir, split):
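    """Write each article's lines to its own <event_id>/<text_id>.txt file inside the split's directory."""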
    event_dir = f"{split_to_dir[split]}/{event_id}"
    os.makedirs(event_dir, exist_ok=True)
    for art_lines, row in zip(text_lines, text_meta_rows):
        text_file = f"{event_dir}/{row['text_id']}.txt"
        with open(text_file, "w", encoding="utf-8") as f:
            for line in art_lines:
                # plain "\n": in text mode, os.linesep would come out as "\r\r\n" on Windows
                f.write(line + "\n")


def process_events():
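    """Read the raw JSON dump and write the all/dev/main splits under output/crashes/split_data."""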
print("Loading data file...") | |
with open(DATA_FILE, encoding="utf-8") as f: | |
data = json.load(f) | |
event_all_rows = [] | |
event_dev_rows = [] | |
event_main_rows = [] | |
text_all_rows = [] | |
text_dev_rows = [] | |
text_main_rows = [] | |
# make empty text files | |
text_file_basenames = { | |
"all": "output/crashes/split_data/all.texts", | |
"dev": "output/crashes/split_data/split_dev10.texts", | |
"main": "output/crashes/split_data/split_main.texts" | |
} | |
for split, bn in text_file_basenames.items(): | |
for ext in [".text.txt", ".ids.txt"]: | |
f = open(f"{bn}{ext}", "w", encoding="utf-8") | |
f.close() | |
# clear & make text file directories | |
text_files_by_event_dir = {} | |
for split in ["all", "dev", "main"]: | |
prefix = "split_dev10" if split == "dev" else "split_main" if split == "main" else "all" | |
text_dir = f"output/crashes/split_data/{prefix}_texts_by_event" | |
text_files_by_event_dir[split] = text_dir | |
if os.path.exists(text_dir): | |
shutil.rmtree(text_dir) | |
os.mkdir(text_dir) | |
# helper function for writing text files | |
def append_to_txt(txt_file, lines): | |
with open(txt_file, "a", encoding="utf-8") as f_out: | |
for art_lines in lines: | |
for line in art_lines: | |
f_out.write(line + os.linesep) | |
print("Processing events...") | |
for event in data: | |
event_id = event["id"] | |
print(f"\tevent_id={event_id}") | |
try: | |
timestamp = datetime.fromisoformat(event["date"]) | |
except ValueError: | |
timestamp = None | |
event_row = { | |
"event:id": event_id, | |
"event:date": timestamp.strftime("%Y-%m-%d") if timestamp else None, | |
"event:time": timestamp.strftime("%H-%M-%S") if timestamp and is_a_real_time(timestamp) else None, | |
"event:coordinates": f"{event['latitude'], event['longitude']}", | |
"vehicle_involved": 1 if any(p for p in event["persons"] if p["transportationmode"] in range(5, 14)) else 0 | |
} | |
for health, health_code in (("dead", 3), ("injured", 2)): | |
all_with_health = [p for p in event["persons"] if p["health"] == health_code] | |
event_row[f"outcomes:{health}:total"] = len(all_with_health) | |
event_row[f"outcomes:{health}:child"] = len([p for p in all_with_health if p["child"] == 1]) | |
for mode, mode_codes in (("pedestrian", [1]), ("cyclist", [2]), ("vehicle", range(5, 14))): | |
event_row[f"outcomes:{health}:{mode}"] = len([p for p in all_with_health | |
if p["transportationmode"] in mode_codes]) | |
        text_lines, text_id_lines, text_meta_rows = extract_text_info(event)
        # every event goes into the "all" split...
        event_all_rows.append(event_row)
        text_all_rows.extend(text_meta_rows)
        append_to_txt(text_file_basenames["all"] + ".text.txt", text_lines)
        append_to_txt(text_file_basenames["all"] + ".ids.txt", text_id_lines)
        write_to_text_by_event(text_lines, text_meta_rows, event_id, text_files_by_event_dir, "all")
        # ...and additionally into either the dev (~10% of events) or the main split
        if random.random() < DEV_PORTION:
            event_dev_rows.append(event_row)
            text_dev_rows.extend(text_meta_rows)
            append_to_txt(text_file_basenames["dev"] + ".text.txt", text_lines)
            append_to_txt(text_file_basenames["dev"] + ".ids.txt", text_id_lines)
            write_to_text_by_event(text_lines, text_meta_rows, event_id, text_files_by_event_dir, "dev")
        else:
            event_main_rows.append(event_row)
            text_main_rows.extend(text_meta_rows)
            append_to_txt(text_file_basenames["main"] + ".text.txt", text_lines)
            append_to_txt(text_file_basenames["main"] + ".ids.txt", text_id_lines)
            write_to_text_by_event(text_lines, text_meta_rows, event_id, text_files_by_event_dir, "main")

    all_ev_df = pd.DataFrame(event_all_rows)
    main_ev_df = pd.DataFrame(event_main_rows)
    dev_ev_df = pd.DataFrame(event_dev_rows)
    for df, file in ((all_ev_df, "all.events"), (main_ev_df, "split_main.events"), (dev_ev_df, "split_dev10.events")):
        df.to_csv(f"output/crashes/split_data/{file}.csv")
    all_txt_df = pd.DataFrame(text_all_rows)
    main_txt_df = pd.DataFrame(text_main_rows)
    dev_txt_df = pd.DataFrame(text_dev_rows)
    for df, file in ((all_txt_df, "all.texts"), (main_txt_df, "split_main.texts"), (dev_txt_df, "split_dev10.texts")):
        df.to_csv(f"output/crashes/split_data/{file}.meta.csv")


if __name__ == "__main__":
    main()