import librosa
import numpy as np

# Analysis window and hop sizes; the function below divides them by 1000 and
# multiplies by the sampling rate, i.e. they are expressed in milliseconds.
mel_window_length = 25
mel_window_step = 10
# Number of mel bands requested from librosa.
mel_n_channels = 40
# Sampling rate handed to librosa (samples per second).
sampling_rate = 16000


def wav_to_mel_spectrogram(wav):
    """
    Compute the mel spectrogram of a preprocessed waveform for the encoder.

    The window and hop sizes are converted to sample counts using the
    module-level ``sampling_rate``. The result of
    ``librosa.feature.melspectrogram`` is cast to ``float32`` and transposed
    before being returned.

    Note: this is not a log-mel spectrogram; no logarithm is applied.

    :param wav: the audio waveform, passed to librosa as ``y``.
    :return: the transposed ``float32`` mel spectrogram (per the librosa
        documentation this should be laid out as (frames, mel channels)).
    """
    # Convert the window/hop durations into whole numbers of samples.
    window_samples = int(sampling_rate * mel_window_length / 1000)
    hop_samples = int(sampling_rate * mel_window_step / 1000)

    mel = librosa.feature.melspectrogram(
        y=wav,
        sr=sampling_rate,
        n_fft=window_samples,
        hop_length=hop_samples,
        n_mels=mel_n_channels,
    )

    mel_float32 = mel.astype(np.float32)
    return mel_float32.T