Spaces:
Build error
Build error
File size: 1,439 Bytes
b11ac48 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
import os
import random

import numpy as np
import pandas as pd
# Seed the stdlib RNG once at import time so that the random.random() draws
# used for the train/dev/test split are the same on every run.
random.seed(1996)
# CSV with the annotated headlines; read by the preprocessing functions below.
CORPUS_ANNOTATED = "data/migration/corpus_with_frames_and_orientation.csv"
# CSV path for the whole corpus.
# NOTE(review): presumably the unannotated superset of headlines -- confirm.
CORPUS_ALL = "data/migration/corpus_all.csv"
# Per-row probability of landing in the dev / test split; every row that falls
# in neither goes to train (so roughly 1 - RATIO_DEV - RATIO_TEST of the rows).
RATIO_DEV = 0.05
RATIO_TEST = 0.25
def preprocess_annotated(
    corpus_path=None,
    output_dir="output/migration/preprocess",
    ratio_dev=None,
    ratio_test=None,
):
    """Randomly split the annotated corpus into train/dev/test CSV files.

    Each row draws one number from the module-level ``random`` generator and
    is assigned independently: to dev if the draw is below ``ratio_dev``, to
    test if it is below ``ratio_dev + ratio_test``, otherwise to train. The
    split sizes are therefore only approximately proportional to the ratios.

    Args:
        corpus_path: CSV file to read (latin1-encoded). Defaults to
            ``CORPUS_ANNOTATED``.
        output_dir: Directory that receives ``annotations_train.csv``,
            ``annotations_dev.csv`` and ``annotations_test.csv``. It is
            created if it does not exist yet.
        ratio_dev: Probability of a row going to dev. Defaults to
            ``RATIO_DEV``.
        ratio_test: Probability of a row going to test. Defaults to
            ``RATIO_TEST``.

    Returns:
        None. The three splits are written to disk; progress is printed.
    """
    # Resolve module-level defaults lazily so explicit arguments always win.
    if corpus_path is None:
        corpus_path = CORPUS_ANNOTATED
    if ratio_dev is None:
        ratio_dev = RATIO_DEV
    if ratio_test is None:
        ratio_test = RATIO_TEST

    print("Loading corpus...")
    df = pd.read_csv(corpus_path, encoding="latin1")
    print(f"\tfound {len(df)} annotated headlines")

    print("Making random train/dev/test split...")
    split_positions = {"train": [], "dev": [], "test": []}
    test_cutoff = ratio_dev + ratio_test
    # Exactly one draw per row, in row order, so a seeded run keeps producing
    # the same assignment as before.
    for position in range(len(df)):
        draw = random.random()
        if draw < ratio_dev:
            split_positions["dev"].append(position)
        elif draw < test_cutoff:
            split_positions["test"].append(position)
        else:
            split_positions["train"].append(position)

    for split_name in ("train", "dev", "test"):
        print(f"\tassigned {len(split_positions[split_name])} samples to {split_name}")

    # to_csv does not create missing parent directories; without this the
    # function would fail only after all the work above has been done.
    os.makedirs(output_dir, exist_ok=True)
    for split_name in ("train", "dev", "test"):
        # Positional selection; the original row index is kept as the first
        # CSV column (pandas default), matching the previous output format.
        split_df = df.iloc[split_positions[split_name]]
        split_df.to_csv(os.path.join(output_dir, f"annotations_{split_name}.csv"))
def preprocess_all(corpus_path=None):
    """Load the full corpus and iterate over its rows (processing not yet implemented).

    Previously this read ``CORPUS_ANNOTATED``, which left ``CORPUS_ALL``
    unused and made the function operate on the annotated subset despite its
    name; it now reads ``CORPUS_ALL`` by default.

    Args:
        corpus_path: CSV file to read (latin1-encoded). Defaults to
            ``CORPUS_ALL``.

    Returns:
        None.
    """
    if corpus_path is None:
        corpus_path = CORPUS_ALL
    df = pd.read_csv(corpus_path, encoding="latin1")
    for _, _row in df.iterrows():
        # TODO: per-headline preprocessing is still a placeholder.
        pass
if __name__ == "__main__":
    # Script entry point. The annotated train/dev/test split is currently
    # disabled (commented out below); only preprocess_all() runs.
    # preprocess_annotated()
    preprocess_all()
|