File size: 1,439 Bytes
b11ac48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import random
from pathlib import Path

import numpy as np
import pandas as pd

# Seed the global RNG so the random train/dev/test assignment is reproducible
# from run to run.
random.seed(1996)


# Input CSV holding the annotated headlines.
CORPUS_ANNOTATED = "data/migration/corpus_with_frames_and_orientation.csv"
# NOTE(review): presumably the full corpus (annotated or not) — confirm.
CORPUS_ALL = "data/migration/corpus_all.csv"

# Per-row probabilities of landing in the dev and test splits; rows that fall
# into neither go to train.
RATIO_DEV = 0.05
RATIO_TEST = 0.25


def preprocess_annotated(corpus_path=None, output_dir=None, ratio_dev=None, ratio_test=None):
    """Randomly split the annotated corpus into train/dev/test CSV files.

    Every row gets exactly one ``random.random()`` draw, in row order, from
    the global ``random`` generator. The split is therefore reproducible only
    for a fixed seed and an unchanged sequence of prior draws.

    Args:
        corpus_path: CSV file to read (decoded as latin1). Defaults to
            ``CORPUS_ANNOTATED``.
        output_dir: Directory that receives ``annotations_train.csv``,
            ``annotations_dev.csv`` and ``annotations_test.csv``. It is
            created if missing. Defaults to ``output/migration/preprocess``.
        ratio_dev: Probability of assigning a row to dev. Defaults to
            ``RATIO_DEV``.
        ratio_test: Probability of assigning a row to test. Defaults to
            ``RATIO_TEST``.

    Raises:
        ValueError: If a ratio lies outside ``[0, 1]`` or the two ratios sum
            to more than 1.
    """
    # Defaults are resolved at call time so the module constants stay the
    # single source of truth for argument-less calls.
    corpus_path = CORPUS_ANNOTATED if corpus_path is None else corpus_path
    output_dir = "output/migration/preprocess" if output_dir is None else output_dir
    ratio_dev = RATIO_DEV if ratio_dev is None else ratio_dev
    ratio_test = RATIO_TEST if ratio_test is None else ratio_test

    if not 0 <= ratio_dev <= 1 or not 0 <= ratio_test <= 1:
        raise ValueError("ratio_dev and ratio_test must each lie in [0, 1]")
    if ratio_dev + ratio_test > 1:
        raise ValueError("ratio_dev + ratio_test must not exceed 1")

    print("Loading corpus...")
    df = pd.read_csv(corpus_path, encoding="latin1")
    print(f"\tfound {len(df)} annotated headlines")

    train_idx = []
    dev_idx = []
    test_idx = []
    test_cutoff = ratio_dev + ratio_test

    print("Making random train/dev/test split...")
    # One draw per row position, in order: changing this would change which
    # rows land in which split for a given seed.
    for position in range(len(df)):
        draw = random.random()
        if draw < ratio_dev:
            dev_idx.append(position)
        elif draw < test_cutoff:
            test_idx.append(position)
        else:
            train_idx.append(position)

    splits = (("train", train_idx), ("dev", dev_idx), ("test", test_idx))
    for split_name, positions in splits:
        print(f"\tassigned {len(positions)} samples to {split_name}")

    # to_csv does not create missing parent directories, so make sure the
    # target exists before writing.
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # The original row index is written as the first (unnamed) column, which
    # lets each split be traced back to its rows in the source file.
    for split_name, positions in splits:
        df.iloc[positions].to_csv(out_dir / f"annotations_{split_name}.csv")


def preprocess_all(corpus_path=None):
    """Load the full corpus and iterate over its rows.

    Per-row processing is not implemented yet; the loop is a placeholder.

    Args:
        corpus_path: CSV file to read (decoded as latin1). Defaults to
            ``CORPUS_ALL``.
    """
    # Read the full corpus rather than the annotated subset: this function is
    # the "all" counterpart of preprocess_annotated.
    corpus_path = CORPUS_ALL if corpus_path is None else corpus_path
    df = pd.read_csv(corpus_path, encoding="latin1")
    for _, row in df.iterrows():
        # TODO: per-row preprocessing goes here.
        pass



if __name__ == "__main__":
    # The annotated train/dev/test split is currently disabled; re-enable the
    # call below to regenerate the split files.
    # preprocess_annotated()
    preprocess_all()