Spaces:

kotoba-tech
/

kotoba-speech

Running on T4

yuta0306

first commit

565faca 8 months ago

599 Bytes

	import librosa
	import numpy as np

	# Mel-spectrogram settings read by wav_to_mel_spectrogram below.
	# Analysis window length in milliseconds (converted to samples via sampling_rate / 1000).
	mel_window_length = 25
	# Step between successive windows in milliseconds (converted to the hop length in samples).
	mel_window_step = 10
	# Number of mel bands requested from librosa (passed as n_mels).
	mel_n_channels = 40
	# Sampling rate, in samples per second, passed to librosa as sr and used for the ms -> samples conversion.
	sampling_rate = 16000


	def wav_to_mel_spectrogram(wav):
	    """
	    Compute the mel spectrogram of a preprocessed audio waveform for the encoder.

	    The window and hop sizes are derived from the module-level settings
	    (mel_window_length and mel_window_step, both in milliseconds) by converting
	    them to sample counts at sampling_rate. The waveform is passed to librosa
	    as-is with sr=sampling_rate; no resampling is performed here.

	    Note: this is not a log-mel spectrogram; no log/dB scaling is applied here.

	    :param wav: the audio waveform, forwarded to librosa as ``y``
	    :return: the transposed librosa mel spectrogram as float32; for a 1-D
	    waveform this should be (n_frames, mel_n_channels) per librosa's documented
	    output layout
	    """
	    # Milliseconds -> samples: samples/second * ms / 1000.
	    fft_size = int(sampling_rate * mel_window_length / 1000)
	    hop_size = int(sampling_rate * mel_window_step / 1000)

	    mel = librosa.feature.melspectrogram(
	        y=wav,
	        sr=sampling_rate,
	        n_fft=fft_size,
	        hop_length=hop_size,
	        n_mels=mel_n_channels,
	    )
	    # Cast first, then transpose, so the returned array is the float32 copy's transposed view.
	    mel_f32 = mel.astype(np.float32)
	    return mel_f32.T