import os
import sys

import librosa
import numpy as np
from scipy.io import wavfile
from sklearn.preprocessing import normalize
class SoundPreprocessing:
    """
    Turn WAV recordings into fixed-size, three-channel feature images.

    The last axis of every image holds, in order: a tiled block of spectral
    descriptors (bandwidth, centroid, chroma), the STFT magnitude, and the
    MFCCs. Each channel is zero-padded to a common (height, width).

    Parameters
    ----------
    sr (int): sampling rate; stored on the instance only, because
        ``get_features`` uses the rate read from each WAV file
    max_size (iterable): resulting shape of the tensor as (height, width);
        the height may be adjusted, see ``generate_features``
    n_fft (int): FFT window size handed to the STFT and the MFCC computation
    n_mfcc (int): number of MFCC
    hop_length (int): hop between analysis frames, in samples

    Attributes
    ----------
    shape_changed (bool): whether the latest ``generate_features`` call had
        to use a height different from the requested one
    resolved_max_size (tuple or None): (height, width) used by the latest
        ``generate_features`` call, ``None`` before the first call
    """

    # Rows that open the descriptor channel: one bandwidth + one centroid row.
    _HEADER_ROWS = 2
    # Rows in the chroma block that closes every repeated group.
    _CHROMA_ROWS = 12
    # One repeated group: bandwidth row + centroid row + chroma block.
    _GROUP_ROWS = 14
    # Exclusive upper bound of the heights considered when snapping a request.
    _HEIGHT_LIMIT = 1000

    def __init__(self, *, sr, max_size, n_fft, n_mfcc=60, hop_length=512):
        self.sr = sr
        self.n_fft = n_fft
        self.n_mfcc = n_mfcc
        self.max_size = max_size
        self.hop_length = hop_length
        self.shape_changed = False
        self.resolved_max_size = None

    def padding(self, array, xx, yy):
        """
        Zero-pad a 2-D array, roughly centred, up to a target shape.

        Parameters
        ----------
        array: numpy array
        xx: desired height
        yy: desired width

        Returns: padded array. An axis that is already at least as long as
        requested is left untouched (the array is never cropped).
        """
        h = array.shape[0]
        w = array.shape[1]
        # When the missing amount is odd, the extra row/column goes to the
        # trailing side.
        top = max((xx - h) // 2, 0)
        bottom = max(xx - top - h, 0)
        left = max((yy - w) // 2, 0)
        right = max(yy - left - w, 0)
        return np.pad(array, pad_width=((top, bottom), (left, right)),
                      mode="constant")

    @classmethod
    def _resolve_height(cls, requested, minimum):
        """Return the image height to use: of the form 2 + 14*k and >= minimum."""
        candidates = np.arange(cls._HEADER_ROWS, cls._HEIGHT_LIMIT,
                               cls._GROUP_ROWS)
        height = requested
        if height not in candidates:
            # Nearest allowed height; ``min`` keeps the first (smaller)
            # candidate when two are equally close.
            height = min(candidates, key=lambda v: abs(v - requested))
        if height < minimum:
            # Smallest 2 + 14*k that can hold ``minimum`` rows. Computed
            # arithmetically so heights beyond the snapping table also work.
            groups = -(-(minimum - cls._HEADER_ROWS) // cls._GROUP_ROWS)
            height = cls._HEADER_ROWS + cls._GROUP_ROWS * groups
        return int(height)

    def _pad_exact(self, feature, height, width, label):
        """Pad ``feature`` to exactly (height, width); raise if it is larger."""
        if feature.shape[0] > height or feature.shape[1] > width:
            # ``padding`` never crops, so an oversize feature would only fail
            # later with an opaque shape mismatch while stacking.
            raise ValueError(
                f"{label} has shape {feature.shape}, which does not fit "
                f"into ({height}, {width})"
            )
        return self.padding(feature, height, width)

    def generate_features(self, y_cut, sr, max_size, n_fft, n_mfcc, hop_length):
        """
        Build the three-channel feature image for one audio signal.

        The height actually used is ``max_size[0]`` snapped to the nearest
        value of the form 2 + 14*k, then raised (keeping that form) if the
        STFT magnitude has more rows. The outcome is recorded in
        ``self.shape_changed`` and ``self.resolved_max_size``.

        Parameters
        ----------
        y_cut: 1-D floating point audio signal
        sr: sampling rate of ``y_cut``
        max_size: requested (height, width) of the image
        n_fft: FFT window size for the STFT and the MFCCs
        n_mfcc: number of MFCC
        hop_length: hop between frames, used for every feature so that the
            columns of all three channels refer to the same frames

        Returns: numpy array of shape (height, max_size[1], 3)

        Raises: ValueError if a feature is larger than the target shape
        """
        width = max_size[1]
        spectrogram = np.abs(librosa.stft(y=y_cut, n_fft=n_fft,
                                          hop_length=hop_length))
        height = self._resolve_height(max_size[0], spectrogram.shape[0])
        self.shape_changed = height != max_size[0]
        self.resolved_max_size = (height, width)

        stft = self._pad_exact(spectrogram, height, width, "STFT")
        mfccs = self._pad_exact(
            librosa.feature.mfcc(y=y_cut, sr=sr, n_fft=n_fft,
                                 hop_length=hop_length, n_mfcc=n_mfcc),
            height, width, "MFCC")

        # NOTE: the descriptors keep librosa's default n_fft (only the hop is
        # passed on), exactly as before.
        bandwidth = self._pad_exact(
            normalize(librosa.feature.spectral_bandwidth(
                y=y_cut, sr=sr, hop_length=hop_length)),
            1, width, "spectral bandwidth")
        centroid = self._pad_exact(
            normalize(librosa.feature.spectral_centroid(
                y=y_cut, sr=sr, hop_length=hop_length)),
            1, width, "spectral centroid")
        chroma = self._pad_exact(
            normalize(librosa.feature.chroma_stft(
                y=y_cut, sr=sr, hop_length=hop_length)),
            self._CHROMA_ROWS, width, "chroma")

        # Two header rows followed by k groups of 14 rows give exactly
        # ``height`` rows, so the channel lines up with the other two.
        groups = (height - self._HEADER_ROWS) // self._GROUP_ROWS
        rows = [bandwidth, centroid] + [bandwidth, centroid, chroma] * groups
        descriptors = np.concatenate(rows, axis=0)

        return np.dstack((descriptors, stft, mfccs))

    def get_features(self, df, filepath):
        """
        Read every file listed in ``df`` and build the feature tensor.

        Parameters
        ----------
        df: pandas DataFrame with a "filename" and a "target" column
        filepath: directory that contains the files named in ``df``

        Returns: (X, y) where X stacks one feature image per row of ``df``
        and y has shape (len(df), 1) holding the targets in row order
        """
        X = []
        y = np.zeros(shape=(len(df), 1))
        resized = False
        # Positional iteration: the index labels of ``df`` need not be 0..n-1.
        for pos, (filename, target) in enumerate(zip(df["filename"],
                                                     df["target"])):
            sr_i, aud = wavfile.read(os.path.join(filepath, filename))
            # NOTE(review): float16 keeps few significant bits, so integer
            # PCM samples are presumably rounded here; left unchanged so
            # existing feature values stay the same. Consider float32.
            aud = aud.astype(np.float16)
            X.append(self.generate_features(y_cut=aud, sr=sr_i,
                                            n_fft=self.n_fft,
                                            n_mfcc=self.n_mfcc,
                                            max_size=self.max_size,
                                            hop_length=self.hop_length))
            y[pos] = target
            resized = resized or self.shape_changed
        if resized:
            print(f"New max_size is {self.resolved_max_size}")
        return np.array(X), y