Add prepare_data.py script to split data into train and validation sets
Browse files- prepare_data.py +21 -0
prepare_data.py
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import pandas as pd
|
3 |
+
from sklearn.model_selection import train_test_split
|
4 |
+
|
5 |
+
# Directory prefix that the read_csv converter prepends to each image name.
DATA_DIR = "data/train"
|
6 |
+
# Path of the labels CSV loaded with pd.read_csv.
CSV_PATH = "data/trainLabels.csv"
|
7 |
+
# Passed as `test_size` to train_test_split: the fraction of rows that go to
# the validation set.
TEST_SIZE = 0.2
|
8 |
+
# Passed as `random_state` to train_test_split so the split is reproducible.
RANDOM_STATE = 42
|
9 |
+
|
10 |
+
# Load the CSV file into a pandas DataFrame, converting each image name into its file path
|
11 |
+
# Because `names=` is given without `header=`, pandas treats every line of the
# file as data.  NOTE(review): if CSV_PATH starts with a header line, that line
# becomes a row too -- confirm it is removed by the file-existence filter that
# follows.
df = pd.read_csv(
    CSV_PATH,
    names=["image_path", "label"],
    # Turn the bare image name in the first column into "<DATA_DIR>/<name>.jpeg".
    converters={"image_path": lambda stem: f"{DATA_DIR}/{stem}.jpeg"},
)
|
12 |
+
|
13 |
+
# Drop rows whose image file does not exist
|
14 |
+
# os.path.exists is called once per path; the resulting boolean Series is used
# as a row mask, keeping only rows whose image file is present on disk.
df = df[df["image_path"].apply(os.path.exists)]
|
15 |
+
|
16 |
+
# Split the data into train and validation sets so that both sets have the same class distribution
|
17 |
+
# Stratifying on the label column keeps class proportions matched across the
# two subsets; the fixed seed makes the shuffle repeatable between runs.
df_train, df_val = train_test_split(
    df,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=df["label"],
)
|
18 |
+
|
19 |
+
# Save the train and validation sets to CSV files
|
20 |
+
# index=False leaves the DataFrame index out of the file, so only the data
# columns (and a header row with their names) are written.
df_train.to_csv("data/train.csv", index=False)
|
21 |
+
# Written the same way as the training split: data columns only, no index.
df_val.to_csv("data/val.csv", index=False)
|