Spaces:
Runtime error
Runtime error
from huggingface_hub import from_pretrained_keras | |
import numpy as np | |
import tensorflow as tf | |
from tensorflow.keras import layers | |
import tensorflow_io as tfio | |
import gradio as gr | |
import librosa | |
import librosa.display | |
import matplotlib.pyplot as plt | |
class MelSpec(layers.Layer):
    """Keras layer that converts raw audio into a log-scaled mel-spectrogram.

    Args:
        frame_length: Window length, in samples, passed to ``tf.signal.stft``.
        frame_step: Hop size, in samples, passed to ``tf.signal.stft``.
        fft_length: FFT size. When ``None`` the smallest power of two that
            encloses ``frame_length`` is used, matching the documented default
            of ``tf.signal.stft``.
        sampling_rate: Sample rate handed to the mel filterbank builder.
        num_mel_channels: Number of mel bins produced.
        freq_min: Lower edge of the mel filterbank, in hertz.
        freq_max: Upper edge of the mel filterbank, in hertz.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(
        self,
        frame_length=1024,
        frame_step=256,
        fft_length=None,
        sampling_rate=22050,
        num_mel_channels=80,
        freq_min=125,
        freq_max=7600,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.frame_length = frame_length
        self.frame_step = frame_step
        self.fft_length = fft_length
        self.sampling_rate = sampling_rate
        self.num_mel_channels = num_mel_channels
        self.freq_min = freq_min
        self.freq_max = freq_max

        # The STFT emits ``fft_length // 2 + 1`` bins, not
        # ``frame_length // 2 + 1``. Resolve the FFT size once so that the
        # filterbank and the STFT always agree, even for a non-power-of-two
        # ``frame_length`` or an explicit ``fft_length``.
        if fft_length is None:
            self._effective_fft_length = 1 << (int(frame_length) - 1).bit_length()
        else:
            self._effective_fft_length = fft_length

        self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=self.num_mel_channels,
            num_spectrogram_bins=self._effective_fft_length // 2 + 1,
            sample_rate=self.sampling_rate,
            lower_edge_hertz=self.freq_min,
            upper_edge_hertz=self.freq_max,
        )

    def call(self, audio):
        """Return the log-mel spectrogram of ``audio``.

        Args:
            audio: Tensor whose last axis has size 1; that axis is squeezed
                away before the STFT is taken.

        Returns:
            The mel-projected power spectrogram converted to decibels with
            ``tfio.audio.dbscale`` (``top_db=80``).
        """
        stft = tf.signal.stft(
            tf.squeeze(audio, -1),
            self.frame_length,
            self.frame_step,
            self._effective_fft_length,
            pad_end=True,
        )
        # Taking the magnitude of the STFT output
        magnitude = tf.abs(stft)
        # Project the power spectrogram onto the mel filterbank, then move to
        # the decibel scale.
        mel = tf.matmul(tf.square(magnitude), self.mel_filterbank)
        log_mel_spec = tfio.audio.dbscale(mel, top_db=80)
        return log_mel_spec

    def get_config(self):
        """Return the layer config, including every constructor argument."""
        config = super().get_config()
        config.update(
            {
                "frame_length": self.frame_length,
                "frame_step": self.frame_step,
                # Store the user-supplied value (possibly None) so the layer
                # round-trips through serialization unchanged.
                "fft_length": self.fft_length,
                "sampling_rate": self.sampling_rate,
                "num_mel_channels": self.num_mel_channels,
                "freq_min": self.freq_min,
                "freq_max": self.freq_max,
            }
        )
        return config
# Generator used by ``inference``/``predict``; loaded once at import time from
# the Hugging Face Hub repository named below.
model = from_pretrained_keras(
    "keras-io/MelGAN-spectrogram-inversion",
)
def inference(audio, model):
    """Invert the mel-spectrogram of an audio file back to a waveform.

    Args:
        audio: Path (or other source accepted by ``librosa.load``) of the
            audio to process.
        model: Object exposing ``predict``; it is fed the batched log-mel
            spectrogram.

    Returns:
        A tuple ``(waveform, reconstructed, sample_rate)``: the samples loaded
        by librosa, the squeezed model prediction, and the sampling rate
        reported by ``librosa.load``.
    """
    waveform, sample_rate = librosa.load(audio)

    # MelSpec squeezes the last axis, so give the samples a trailing axis.
    mel_layer = MelSpec()
    log_mel = mel_layer(tf.expand_dims(waveform, axis=-1))

    # Add a leading batch axis of size 1 for ``predict``.
    batched_mel = tf.expand_dims(log_mel, axis=0)
    reconstructed = model.predict(batched_mel, batch_size=1, verbose=0)

    return waveform, reconstructed.squeeze(), sample_rate
def predict(audio):
    """Plot spectrograms of the original and the re-synthesised audio.

    Args:
        audio: Audio source forwarded unchanged to ``inference``.

    Returns:
        A matplotlib ``Figure`` with two stacked panels sharing the x axis:
        the original signal on top and the model output below.
    """
    # Imported locally so the figure can be built without pyplot. Figures made
    # with ``plt.subplots`` stay in pyplot's global registry until closed,
    # which leaks memory in a long-running server. ``plt.gcf()`` is also
    # process-global state and may not be this call's figure when requests
    # overlap.
    from matplotlib.figure import Figure

    x, x_pred, sr = inference(audio, model)

    fig = Figure(figsize=(10, 8), dpi=120)
    axes = fig.subplots(nrows=2, ncols=1, sharex=True)

    panels = (
        (x, 'Spectrogram of Original sample audio'),
        (x_pred, 'Spectrogram of synthesis sample audio '),
    )
    for axis, (signal, title) in zip(axes, panels):
        # Magnitude STFT in dB relative to the loudest bin of this signal.
        spec_db = librosa.amplitude_to_db(np.abs(librosa.stft(signal)), ref=np.max)
        librosa.display.specshow(
            spec_db, y_axis='linear', x_axis='time', sr=sr, ax=axis
        )
        axis.set(title=title)
        axis.label_outer()

    return fig
# One upload-only audio widget; the callback receives the file path.
# NOTE(review): the ``source=`` keyword here and ``enable_queue=`` in
# ``launch`` appear to be accepted only by older Gradio releases — confirm
# against the Gradio version this app is deployed with.
inputs = [gr.Audio(source="upload", label='Upload audio file', type="filepath")]

# Example files offered in the UI; paths are relative to the working directory.
examples = ['sample_1.wav', 'sample_2.wav']

demo = gr.Interface(
    fn=predict,
    inputs=inputs,
    outputs=gr.Plot(),
    examples=examples,
    title="MelGAN-based spectrogram inversion",
    description="Inversion of audio from mel-spectrograms using the MelGAN architecture and feature matching",
    article="Author: <a href=\"https://huggingface.co/vumichien\">Vu Minh Chien</a>. Based on the keras example from <a href=\"https://keras.io/examples/audio/melgan_spectrogram_inversion/\">Darshan Deshpande</a>",
)
demo.launch(debug=False, enable_queue=True)