"""Split the migration headline corpus into a main set and a dev set.

Reads ``data/migration/corpus_all.csv`` together with a line-aligned file of
truecased titles, assigns each row to the dev set with probability
``DEV_RATIO`` (otherwise to the main set), and writes per-row title text
files plus event/text metadata CSVs under ``output/migration/split_data``.
"""
import datetime
import os
import random

import pandas as pd

# Fixed seed so the main/dev assignment is reproducible between runs.
random.seed(1996)

DEV_RATIO = 0.10

_TITLES_TC_PATH = "data/migration/corpus_titoli_all_raw.truecase_bilstm.txt"
_CORPUS_PATH = "data/migration/corpus_all.csv"
_OUTPUT_ROOT = "output/migration/split_data"
_PROVIDER_PREFIX = "*T_"


def choose_best_casing(orig, predicted):
    """Choose between the original title and the truecaser's prediction.

    If more than half of the characters of ``orig`` are uppercase letters,
    the original casing is treated as uninformative and ``predicted`` is
    returned; otherwise ``orig`` is returned unchanged.

    Args:
        orig: The title as found in the corpus.
        predicted: The truecased version of the same title.

    Returns:
        ``predicted`` for mostly-uppercase, empty or non-string ``orig``;
        ``orig`` otherwise.
    """
    # Missing values (e.g. NaN from pandas) and empty titles carry no casing
    # information, so fall back to the prediction.
    if not isinstance(orig, str) or not orig:
        return predicted
    # Count on the *original* string; upper-casing it first would make every
    # character count as uppercase.
    num_upper = sum(1 for c in orig if c.isupper())
    if num_upper > 0.5 * len(orig):
        return predicted
    return orig


def _strip_provider_prefix(provider, prefix=_PROVIDER_PREFIX):
    """Remove ``prefix`` from the start of ``provider`` if it is present."""
    # str.lstrip(prefix) would strip any leading run of the characters
    # '*', 'T' and '_', eating the first letters of names that start with T.
    if provider.startswith(prefix):
        return provider[len(prefix):]
    return provider


def _write_title_files(out_dir, idx, text_data):
    """Write the best/orig/truecase title variants of one row to ``out_dir``."""
    variants = (
        ("best", "title"),
        ("orig", "title_orig"),
        ("truecase", "title_truecased"),
    )
    for suffix, key in variants:
        with open(f"{out_dir}/{idx}.{suffix}.txt", "w", encoding="utf-8") as f_out:
            f_out.write(text_data[key])


def split_data():
    """Split the corpus into main and dev sets and write all output files.

    Each CSV row is paired with the truecased title on the same line index.
    One ``random.random()`` draw per row, in row order, decides the split,
    so the result is determined by the module-level seed.

    Raises:
        ValueError: If the truecased titles file has fewer lines than the
            corpus CSV has rows.
    """
    events_main = []
    texts_main = []
    events_dev = []
    texts_dev = []

    with open(_TITLES_TC_PATH, encoding="utf-8") as f:
        titles_tc = [line.strip() for line in f]

    df_all = pd.read_csv(_CORPUS_PATH, encoding="latin-1")

    # Fail before writing anything instead of hitting an IndexError halfway
    # through and leaving partial output behind.
    if len(titles_tc) < len(df_all):
        raise ValueError(
            f"{_TITLES_TC_PATH} has {len(titles_tc)} lines but "
            f"{_CORPUS_PATH} has {len(df_all)} rows"
        )

    main_dir = f"{_OUTPUT_ROOT}/split_main_sep_txt_files"
    dev_dir = f"{_OUTPUT_ROOT}/split_dev10_sep_txt_files"
    os.makedirs(main_dir, exist_ok=True)
    os.makedirs(dev_dir, exist_ok=True)

    for idx, (_, row) in enumerate(df_all.iterrows()):
        if idx % 1000 == 0:
            print("Processing line:", idx)

        year = int(row["Anno"])
        event_data = {
            "event:id": idx,
            "event:year": year,
        }
        text_data = {
            "event_id": idx,
            "text_id": idx,
            "pubyear": year,
            "language": "Italian",
            "provider": _strip_provider_prefix(row["Testata"]),
            "title": choose_best_casing(row["Titolo"], titles_tc[idx]),
            "title_truecased": titles_tc[idx],
            "title_orig": row["Titolo"],
        }

        # Exactly one draw per row keeps the split identical for a given seed.
        if random.random() < DEV_RATIO:
            events_dev.append(event_data)
            texts_dev.append(text_data)
            _write_title_files(dev_dir, idx, text_data)
        else:
            events_main.append(event_data)
            texts_main.append(text_data)
            _write_title_files(main_dir, idx, text_data)

    pd.DataFrame(events_main).to_csv(f"{_OUTPUT_ROOT}/split_main.events.csv")
    pd.DataFrame(texts_main).to_csv(f"{_OUTPUT_ROOT}/split_main.texts.meta.csv")
    pd.DataFrame(events_dev).to_csv(f"{_OUTPUT_ROOT}/split_dev10.events.csv")
    pd.DataFrame(texts_dev).to_csv(f"{_OUTPUT_ROOT}/split_dev10.texts.meta.csv")


if __name__ == "__main__":
    split_data()