# Notebook for data preparation

A.A. 2022-2023 - HUMAN DATA ANALYTICS

Authors:
* Mattia Brocco
* Brenda Eloisa Tellez Juarez

In the following notebook the pipeline for data import, preprocessing and storage (using `.parquet` format) is presented.

In [1]:
# Mount Google Drive under /content/drive; `google.colab` is the package
# this import relies on, so the cell is meant for a Colab runtime.
from google.colab import drive
drive.mount("/content/drive")

# Optional: uncomment to change directory to the project folder on the mounted drive.
#%cd /content/drive/MyDrive/Environmental-sounds-UNIPD-2022

In [3]:
import os
import sys
import torch
import librosa
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from librosa import display
from scipy.io import wavfile
from tensorflow import keras
import IPython.display as ipd
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import evaluation
import CNN_support as cnns
from gng import GrowingNeuralGas

%load_ext autoreload
%autoreload 2

In [4]:
# Load the metadata table (one row per audio file) and preview the first rows
metadata_path = './data/meta/esc50.csv'
data = pd.read_csv(metadata_path)
data.head()

Unnamed: 0,filename,fold,target,category,esc10,src_file,take
0,1-100032-A-0.wav,1,0,dog,True,100032,A
1,1-100038-A-14.wav,1,14,chirping_birds,False,100038,A
2,1-100210-A-36.wav,1,36,vacuum_cleaner,False,100210,A
3,1-100210-B-36.wav,1,36,vacuum_cleaner,False,100210,B
4,1-101296-A-19.wav,1,19,thunderstorm,False,101296,A


### 2. Data import & preprocessing
To make the results reproducible, every random step of the pipeline is seeded with `np.random.seed()`.

In [5]:
# DATA AUGMENTATION
#
# Every clip is loaded, min-max scaled, cut/padded to `input_length` samples and
# expanded into `aug_iterations` variants (6 augmentations + the untouched clip).
# One feature matrix per variant is appended to `new_X`, its label goes to `new_y`.

np.random.seed(101)
# A distinct, reproducible noise seed for every clip
randn_seeds = np.random.choice(len(data), size = len(data), replace = False)

aug_iterations = 7

# False -> one plain MFCC matrix per variant.
# True  -> three channels (MFCC, chromagram, MFCC delta) stacked on a new last
#          axis, i.e. the input layout intended for the CNN.
stack_cnn_channels = False

new_X = []
new_y = np.zeros(shape = (aug_iterations*len(randn_seeds), 1))

input_length = 220500
row_count = 0
# `position` is the 0-based row number: it indexes `randn_seeds` safely even if
# `data` does not carry a default RangeIndex, while `i` is the index label.
for position, i in enumerate(data.index):

    sample, sr_sample = librosa.load("./data/audio/{}".format(data.loc[i, "filename"]),
                                     sr = 44100)
    # Min-max scaler [0, 1]. A constant (e.g. silent) clip has zero range and
    # would become NaN through 0/0, so it is mapped to all zeros instead.
    sample_min = sample.min()
    sample_range = sample.max() - sample_min
    if sample_range > 0:
        sample = (sample - sample_min) / sample_range
    else:
        sample = np.zeros_like(sample)

    # Force a fixed length: truncate long clips, zero-pad short ones
    if len(sample) > input_length:
        sample = sample[:input_length]
    else:
        sample = np.pad(sample, (0, max(0, input_length - len(sample))), "constant")

    for n in range(aug_iterations):

        if n == 0:
            # NOISE INJECTION (seeded per clip, so the noise is reproducible)
            np.random.seed(randn_seeds[position])
            noise = np.random.randn(len( sample ))
            augmented_data = (sample + 0.005 * noise)

        elif n == 1:
            # TIME SHIFT: circular right shift by 22050 samples
            augmented_data = np.roll(sample, 22050)

        elif n == 2:
            # PITCH SHIFT: shift up by 3 steps
            augmented_data = librosa.effects.pitch_shift(y = sample, sr = sr_sample,
                                                         n_steps = 3)
        elif n == 3:
            # PITCH SHIFT: shift down by 3 steps
            augmented_data = librosa.effects.pitch_shift(y = sample, sr = sr_sample,
                                                         n_steps = -3)
        elif n == 4:
            # SPEED SHIFT: faster (returns shorter array, zero-padded back to length)
            augmented_data = librosa.effects.time_stretch(y = sample, rate = 1.25)
            augmented_data = np.append(augmented_data,
                                       np.zeros(shape = len(sample) - len(augmented_data)))
        elif n == 5:
            # SPEED SHIFT: slower (returns longer array, truncated back to length)
            augmented_data = librosa.effects.time_stretch(y = sample, rate = 0.8)
            augmented_data = augmented_data[:len(sample)]

        else:
            # KEEP NORMAL SAMPLE
            augmented_data = sample

        new_MFCC = librosa.feature.mfcc(y = augmented_data, sr = sr_sample,
                                        hop_length = 512, n_mfcc = 60)

        if stack_cnn_channels:
            # For the CNN, the input is composed of three channels stacked together
            new_chromagram = librosa.feature.chroma_stft(y = augmented_data, sr = sr_sample,
                                                         hop_length = 512, win_length = 1024,
                                                         n_chroma = 60)
            new_delta = librosa.feature.delta(new_MFCC)
            new_instance = np.dstack((new_MFCC, new_chromagram, new_delta))
        else:
            new_instance = new_MFCC

        new_X.append(new_instance)
        new_y[row_count] = data.loc[i, "target"]

        row_count += 1


new_X = np.array(new_X)



In [6]:
# Sanity check: feature tensor and label vector must agree on the first axis.
# NOTE(review): the stored output below has a trailing axis of size 3, which a
# plain MFCC matrix per sample would not produce - confirm which variant was run.
(new_X.shape, new_y.shape)

((14000, 60, 431, 3), (14000, 1))

In [8]:
# Reduce float precision in order to decrease the size of the files
# Store features as 32-bit floats to shrink the files written afterwards
new_X = new_X.astype("float32")

In [2]:
def data_to_parquet(arr, name):
    """
    Write an array to '<current working directory>/data/<name>.parquet'.

    Whether the data is meant for the CNN or the RNN, an array
    with more than two dimensions is flattened along every
    dimension except the first (number of samples), so that
    each sample becomes one row of the stored table.

    When required, the files are then imported
    via the 'pandas' library and properly reshaped.

    Parameters
    ----------
    arr : array to store, samples along the first axis.
    name : file name, without the '.parquet' extension.
    """
    needs_flattening = arr.ndim > 2
    table = pd.DataFrame(arr.reshape(arr.shape[0], -1) if needs_flattening else arr)

    # Column labels are turned into strings before writing
    table.columns = list(map(str, table.columns))

    target = os.getcwd() + f"/data/{name}.parquet"
    table.to_parquet(target)
    

# Persist the features and the labels, one parquet file each
for array, file_stem in ((new_X, "X_CNN_60x431x3_7times"), (new_y, "y_CNN_7times")):
    data_to_parquet(array, file_stem)

```python
# Get data for RNN: one MFCC matrix per audio file, without augmentation
X = []
y = np.zeros(shape = (len(data), 1))

for row_id in data.index:

    audio_path = "./data/audio/{}".format(data.loc[row_id, "filename"])
    waveform, rate = librosa.load(audio_path, sr = 44100)

    X.append(librosa.feature.mfcc(y = waveform, sr = rate,
                                  hop_length = 512, n_mfcc = 60))

    y[row_id] = data.loc[row_id, "target"]

X = np.array(X)
```

### Adversarial attacks

In [None]:
# create an adversarial example
def create_adversarial_example(x2, y_new, model_bidirectional):
    # convert the label to a one-hot encoded vector
    y = tf.keras.utils.to_categorical(y_new, num_classes=50)
# compute the gradient of the loss with respect to the input
    with tf.GradientTape() as tape:
        tape.watch(x2)
        logits = model_bidirectional(x2)
        loss_value = tf.losses.categorical_crossentropy(y_new, logits)
    grads = tape.gradient(loss_value, x2)
# create an adversarial example by adding the sign of the gradient to the input
    epsilon = 0.01
    x_adv = x2 + epsilon * tf.sign(grads)
    x_adv = tf.clip_by_value(x_adv, 0, 1)
    return x_adv

In [None]:
# Earlier draft of create_adversarial_example, kept for reference only.
# The whole draft is commented out: with only the `def` line disabled, the
# indented body below made this cell fail with a syntax error.
# It relies on `tf.gradients`, which is not supported when eager execution is
# enabled; the GradientTape-based definition should be used instead.
#
#def create_adversarial_example(x2, y_new, model_bidirectional):
#    # convert the label to a one-hot encoded vector
#    y = tf.keras.utils.to_categorical(y_new, num_classes=20)
#    # compute the gradient of the loss with respect to the input
#    logits = model_bidirectional(x2)
#    loss = tf.losses.categorical_crossentropy(y_new, logits)
#    grads, = tf.gradients(loss, x2)
#    # create an adversarial example by adding the sign of the gradient to the input
#    epsilon = 0.01
#    x_adv = x2 + epsilon * tf.sign(grads)
#    x_adv = tf.clip_by_value(x_adv, 0, 1)
#    return x_adv

In [None]:
# create an adversarial example and test it with the model
x_adv = create_adversarial_example(x2, y_new, model_bidirectional)
y_pred_adv = model_bidirectional(x_adv).argmax() # get the predicted label
acc = (y_pred_adv == y_new).mean() # calculate the accuracy
print(f'Model accuracy on adversarial example: {acc:.2f}')

In [None]:
# test the adversarial example
x_adv = create_adversarial_example(x2, y_new, model_bidirectional)
logits_adv = model_bidirectional(x_adv)
y_pred_adv = np.argmax(logits_adv, axis=1)
accuracy = accuracy_score(y_new, y_pred_adv)
print('Accuracy on adversarial example:', accuracy)