Spaces:

kotoba-tech
/

kotoba-speech

Running on T4

File size: 599 Bytes

565faca

import librosa
import numpy as np

# Audio front-end settings read by wav_to_mel_spectrogram.
sampling_rate = 16000  # samples per second; passed to librosa as `sr`
mel_n_channels = 40  # number of mel bands; passed to librosa as `n_mels`
mel_window_length = 25  # analysis window length, in milliseconds
mel_window_step = 10  # hop between successive windows, in milliseconds


def wav_to_mel_spectrogram(wav):
    """
    Compute the mel spectrogram of a preprocessed waveform for the encoder.

    The window and hop sizes are configured in milliseconds at module level
    and are converted to sample counts here using ``sampling_rate``.

    Note: this is not a log-mel spectrogram; no logarithm is applied to the
    values returned by librosa.

    Returns the librosa result cast to float32 and transposed (for a 1-D
    waveform librosa documents the result as (n_mels, frames), so the
    returned array would be (frames, n_mels) -- confirm against callers).
    """
    # Milliseconds -> samples.
    window_samples = int(sampling_rate * mel_window_length / 1000)
    hop_samples = int(sampling_rate * mel_window_step / 1000)

    mel = librosa.feature.melspectrogram(
        y=wav,
        sr=sampling_rate,
        n_fft=window_samples,
        hop_length=hop_samples,
        n_mels=mel_n_channels,
    )

    mel_f32 = mel.astype(np.float32)
    return mel_f32.T