"""Split the annotated migration corpus into random train/dev/test CSV files."""

import random
from pathlib import Path

import numpy as np
import pandas as pd

# Fixed seed so the random split below is reproducible from run to run.
random.seed(1996)

CORPUS_ANNOTATED = "data/migration/corpus_with_frames_and_orientation.csv"
CORPUS_ALL = "data/migration/corpus_all.csv"

# Directory that receives the annotations_{train,dev,test}.csv files.
OUTPUT_DIR = Path("output/migration/preprocess")

# Probability of a row landing in dev / test; the remainder goes to train.
RATIO_DEV = 0.05
RATIO_TEST = 0.25


def _draw_split() -> str:
    """Draw one random number and map it to "dev", "test" or "train"."""
    rnd = random.random()
    if rnd < RATIO_DEV:
        return "dev"
    if rnd < RATIO_DEV + RATIO_TEST:
        return "test"
    return "train"


def preprocess_annotated() -> None:
    """Randomly split the annotated corpus and write one CSV per split.

    Reads ``CORPUS_ANNOTATED`` (latin1-encoded), assigns every row
    independently to dev / test / train according to ``RATIO_DEV`` and
    ``RATIO_TEST``, prints the size of each split and writes the rows to
    ``annotations_train.csv``, ``annotations_dev.csv`` and
    ``annotations_test.csv`` inside ``OUTPUT_DIR``. The DataFrame index
    (the row position in the source file) is written as the first column.

    Raises:
        FileNotFoundError: if ``CORPUS_ANNOTATED`` does not exist.
    """
    print("Loading corpus...")
    df = pd.read_csv(CORPUS_ANNOTATED, encoding="latin1")
    print(f"\tfound {len(df)} annotated headlines")

    print("Making random train/dev/test split...")
    # Key order fixes the order of the report lines and of the file writes.
    split_idx: dict[str, list[int]] = {"train": [], "dev": [], "test": []}
    # Exactly one draw per row, in row order, so a given seed always
    # produces the same assignment.
    for i in range(len(df)):
        split_idx[_draw_split()].append(i)

    for name, idx in split_idx.items():
        print(f"\tassigned {len(idx)} samples to {name}")

    # to_csv does not create missing parent directories.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    for name, idx in split_idx.items():
        df.iloc[idx].to_csv(OUTPUT_DIR / f"annotations_{name}.csv")


def preprocess_all() -> None:
    """Load the full corpus and iterate over its rows.

    Reads ``CORPUS_ALL`` (latin1-encoded).
    """
    df = pd.read_csv(CORPUS_ALL, encoding="latin1")
    # TODO: per-row preprocessing is not implemented yet.
    for _, row in df.iterrows():
        pass


if __name__ == "__main__":
    # The annotated split is currently disabled; only preprocess_all() runs.
    # preprocess_annotated()
    preprocess_all()