# Spaces: ucinlp / Modeling-Uncertainty-in-Explainability
# Runtime error
# File size: 7,474 Bytes

"""Routines for processing data."""
import numpy as np
import os
import pandas as pd
from PIL import Image
from skimage.segmentation import slic, mark_boundaries

import torch
from torchvision import datasets, transforms

# Number of superpixel segments requested from SLIC (passed as n_segments)
NSEGMENTS = 20

# Integer encodings for the protected attribute and the model outcome,
# read by the tabular data loaders below
PARAMS = dict(
    protected_class=1,
    unprotected_class=0,
    positive_outcome=1,
    negative_outcome=0,
)

# Label value per ImageNet subset name (presumably the ImageNet class index)
IMAGENET_LABELS = dict(
    french_bulldog=245,
    scuba_diver=983,
    corn=987,
    broccoli=927,
)

def get_and_preprocess_compas_data():
    """Load and preprocess the COMPAS data.

    Row filtering follows https://github.com/propublica/compas-analysis.
    The function takes no arguments: the raw data is read from
    ``./data/compas-scores-two-years.csv`` and the class/outcome encodings
    come from the module-level ``PARAMS`` dict.

    Returns
    ----------
    dict with the keys
        "X"            : np.ndarray of features (categoricals one-hot encoded,
                         binary ``race`` column appended last)
        "y"            : np.ndarray of outcomes; ``negative_outcome`` where
                         ``score_text == 'High'``, ``positive_outcome`` otherwise
        "column_names" : list of the column names of X
        "cat_indices"  : positional indices of the categorical columns of X
    """
    protected_class = PARAMS['protected_class']
    positive_outcome = PARAMS['positive_outcome']
    negative_outcome = PARAMS['negative_outcome']

    compas_df = pd.read_csv("./data/compas-scores-two-years.csv", index_col=0)

    # NOTE(review): read_csv parses the literal "NA" as NaN by default, so the
    # score_text comparison below presumably never drops a row -- confirm
    # against the data before relying on it.
    keep_rows = ((compas_df['days_b_screening_arrest'] <= 30) &
                 (compas_df['days_b_screening_arrest'] >= -30) &
                 (compas_df['is_recid'] != -1) &
                 (compas_df['c_charge_degree'] != "O") &
                 (compas_df['score_text'] != "NA"))
    # .copy() so the column added below is written to an independent frame
    # instead of a slice of the raw data (avoids SettingWithCopyWarning).
    compas_df = compas_df.loc[keep_rows].copy()

    jail_out = pd.to_datetime(compas_df['c_jail_out'])
    jail_in = pd.to_datetime(compas_df['c_jail_in'])
    compas_df['length_of_stay'] = (jail_out - jail_in).dt.days

    # if person has high score give them the _negative_ model outcome
    y = np.where((compas_df['score_text'] == 'High').to_numpy(),
                 negative_outcome, positive_outcome)

    # race is handled separately from the other features (see below)
    sens = compas_df['race']
    feature_columns = ['age', 'two_year_recid', 'c_charge_degree', 'sex',
                       'priors_count', 'length_of_stay']

    # Pin the dummy dtype: pandas >= 2.0 defaults to bool, and bool columns
    # mixed with integer columns turn ``X.values`` into an object array.
    X = pd.get_dummies(compas_df[feature_columns], dtype=np.uint8)

    # assign African-American as the protected class
    X['race'] = pd.get_dummies(sens, dtype=np.uint8)['African-American'].to_numpy()

    # make sure everything is lining up
    assert ((sens == 'African-American') == (X['race'] == protected_class)).all()
    cols = list(X.columns)

    # Positions of two_year_recid, the c_charge_degree_* / sex_* dummies and
    # race; this assumes c_charge_degree and sex each have exactly two levels
    # after filtering -- TODO confirm against the data.
    categorical_features = [1, 4, 5, 6, 7, 8]

    output = {
        "X": X.values,
        "y": y,
        "column_names": cols,
        "cat_indices": categorical_features
    }

    return output

def get_and_preprocess_german():
    """Load and preprocess the German credit data.

    We use a preprocessed version of German from Ustun et. al.
    https://arxiv.org/abs/1809.06514.  Thanks Berk!

    The function takes no arguments: the data is read from
    ``./data/german_processed.csv`` and the outcome encodings come from the
    module-level ``PARAMS`` dict.

    Returns:
    ----------
    dict with the keys
        "X"            : np.ndarray of features ("GoodCustomer" and
                         "PurposeOfLoan" dropped, "Gender" encoded as
                         1 for "Male" and 0 otherwise)
        "y"            : np.ndarray of outcomes; ``positive_outcome`` where
                         "GoodCustomer" equals 1, ``negative_outcome`` otherwise
        "column_names" : list of the column names of X
        "cat_indices"  : positional indices of the categorical columns of X
    """
    positive_outcome = PARAMS['positive_outcome']
    negative_outcome = PARAMS['negative_outcome']

    german_df = pd.read_csv("./data/german_processed.csv")
    good_customer = german_df["GoodCustomer"].to_numpy()

    X = german_df.drop(["GoodCustomer", "PurposeOfLoan"], axis=1)
    # Binary-encode gender: "Male" -> 1, anything else -> 0
    X['Gender'] = (X['Gender'] == "Male").astype(np.int64)

    y = np.where(good_customer == 1, positive_outcome, negative_outcome)

    # Columns 0-2 plus every column from index 9 onward are categorical
    categorical_features = [0, 1, 2] + list(range(9, X.shape[1]))

    output = {
        "X": X.values,
        "y": y,
        "column_names": list(X.columns),
        "cat_indices": categorical_features,
    }

    return output

def get_PIL_transf():
    """Build the image preprocessing pipeline.

    Returns a ``transforms.Compose`` that first resizes to (256, 256) and
    then applies a center crop of size 224.
    """
    resize_then_crop = [
        transforms.Resize((256, 256)),
        transforms.CenterCrop(224),
    ]
    return transforms.Compose(resize_then_crop)

# def load_image(path):
#     """Loads an image by path."""
#     with open(os.path.abspath(path), 'rb') as f:
#         with Image.open(f) as img:
#             return img.convert('RGB') 

def load_image(pil_image):
    """Return the given image converted to RGB mode.

    Arguments:
        pil_image: An already-opened image object exposing ``convert``.
    """
    rgb_image = pil_image.convert('RGB')
    return rgb_image

def get_imagenet(pil_image, get_label=True):
    """Preprocess a single image and compute its superpixel segmentation.

    Arguments:
        pil_image: The input image. It is converted to RGB and run through
            the transform from ``get_PIL_transf`` before segmentation.
        get_label: Currently has no effect; the returned label is always -1.

    Returns:
        A dict with "X" (the transformed image stacked into an array with a
        leading axis of length 1), "y" (an array holding -1 for that image)
        and "segments" (the SLIC segment map, stacked the same way).
    """
    transform = get_PIL_transf()
    rgb_image = load_image(pil_image)
    pixels = np.array(transform(rgb_image))
    segment_map = slic(pixels, n_segments=NSEGMENTS, compactness=100, sigma=1)

    # Stack into batch form even though there is only one image
    images = np.array([pixels])
    segs = np.array([segment_map])

    # No label lookup is performed; every image gets the placeholder -1
    labels = np.ones(images.shape[0]) * -1

    return {
        "X": images,
        "y": labels,
        "segments": segs
    }


def get_mnist(num):
    """Collect every MNIST test image of one digit, with its segmentation.

    The MNIST test split is loaded from (and downloaded to, if needed)
    ``./data/mnist`` and iterated one image at a time in its stored order.

    Arguments:
        num: The mnist digit to get

    Returns:
        A dict with "X" (the matching images), "y" (an array filled with
        ``num``, one entry per image) and "segments" (the SLIC segment map
        of each image, reshaped to (1, 28, 28)).
    """
    preprocessing = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    mnist_test = datasets.MNIST('./data/mnist',
                                train=False,
                                download=True,
                                transform=preprocessing)
    test_loader = torch.utils.data.DataLoader(mnist_test,
                                              batch_size=1,
                                              shuffle=False)

    digit_images, digit_segments = [], []

    for data, y in test_loader:
        # Skip everything that is not the requested digit
        if y[0] != num:
            continue

        # Drop the batch axis, then segment the 28x28 image
        sample = np.squeeze(data.numpy().astype('double'), axis=0)
        segments = slic(sample.reshape(28, 28, 1),
                        n_segments=NSEGMENTS,
                        compactness=1,
                        sigma=0.1)
        digit_images.append(sample)
        digit_segments.append(segments.reshape(1, 28, 28))

    digit_images = np.array(digit_images)
    digit_segments = np.array(digit_segments)

    return {
        "X": digit_images,
        "y": np.ones(digit_images.shape[0]) * num,
        "segments": digit_segments
    }

def get_dataset_by_name(inp_image, get_label=True):
    """Return the preprocessed data dict for an input image.

    Only the imagenet path is active: both arguments are forwarded
    unchanged to ``get_imagenet`` and its result is returned as-is.

    Arguments:
        inp_image: The image handed to ``get_imagenet``.
        get_label: Passed through to ``get_imagenet``.
    """
    return get_imagenet(inp_image, get_label=get_label)