SoundOfWater / shared /utils /pandas_utils.py
bpiyush's picture
Upload folder using huggingface_hub
eafbf97 verified
raw
history blame
3.22 kB
"""Utility functions for pandas operations"""
from typing import List
import numpy as np
import pandas as pd
def apply_filters(df: pd.DataFrame, filters: dict, reset_index=False):
"""
Filters df based on given filters (key-values pairs).
"""
import omegaconf
X = df.copy()
all_indices = []
for col, values in filters.items():
if isinstance(values, (list, tuple, np.ndarray, omegaconf.listconfig.ListConfig)):
indices = X[col].isin(list(values))
else:
indices = X[col] == values
all_indices.append(indices)
# print(col, values, len(indices), sum(indices))
# X = X[indices]
if len(all_indices):
all_indices = np.array(all_indices)
indices = np.all(all_indices, axis=0)
X = X[indices]
if reset_index:
X = X.reset_index(drop=True)
return X
def apply_antifilters(df: pd.DataFrame, filters: dict, reset_index=False):
"""
Filters df removing rows for given filters (key-values pairs).
"""
X = df.copy()
for col, values in filters.items():
if isinstance(values, (list, tuple, np.ndarray)):
indices = X[col].isin(list(values))
else:
indices = X[col] == values
X = X[~indices]
if reset_index:
X = X.reset_index(drop=True)
return X
def custom_eval(x):
"""Splits string '["a", "b", "c"]' into ["a", "b", "c"]."""
if isinstance(x, str):
x = x.replace('[', '')
x = x.replace(']', '')
x = x.split(',')
x = [y.rstrip().lstrip() for y in x]
return x
else:
return ['NA']
def split_column_into_columns(df, column):
"""
For given df, splits `column` containing values like '["a", "b"]'
into one-hot subcolumns like a. b with `Yes`/`No` values.
"""
df[column] = df[column].apply(custom_eval)
unique_values = []
for i in range(len(df)):
index = df.index[i]
list_of_values = df.loc[index, column]
for x in list_of_values:
if (x != 'NA') and (x != ''):
df.at[index, x] = 'Yes'
if x not in unique_values:
unique_values.append(x)
df[unique_values] = df[unique_values].fillna('No')
df[f'any_{column}'] = df[unique_values].apply(
lambda x: 'Yes' if 'Yes' in list(x) else 'No', axis=1
)
return df
def custom_read_csv(path: str, columns_to_onehot: List) -> pd.DataFrame:
"""Custom CSV reader
Args:
path (str): path to .csv file
columns_to_onehot (List): list of columns to one-hotify
Returns:
pd.DataFrame: loaded df
"""
df = pd.read_csv(path)
for column in columns_to_onehot:
df = split_column_into_columns(df, column)
return df
def split_df(df, test_size=0.2):
from sklearn.model_selection import train_test_split
# split the dataframe into train and test sets
train_df, test_df = train_test_split(df, test_size=test_size, random_state=42)
# split the train set into train and validation sets
train_df, val_df = train_test_split(train_df, test_size=test_size, random_state=42)
return train_df, val_df, test_df