Spaces:
Sleeping
Sleeping
import torch | |
import torchaudio | |
import voicebox.src.attacks.offline.perturbation.voicebox.voicebox as vb #To access VoiceBox class | |
#import voicebox.src.attacks.online.voicebox_streamer as streamer #To access VoiceBoxStreamer class | |
import numpy as np | |
from voicebox.src.constants import PPG_PRETRAINED_PATH | |
#Set voicebox default parameters
# Number of lookahead frames given to the bottleneck (see 'bottleneck_lookahead_frames').
LOOKAHEAD = 5

# Keyword arguments unpacked into vb.VoiceBox(...) when the model is built.
voicebox_kwargs = dict(
    win_length=256,
    ppg_encoder_hidden_size=256,
    use_phoneme_encoder=True,
    use_pitch_encoder=True,
    use_loudness_encoder=True,
    spec_encoder_lookahead_frames=0,
    spec_encoder_type='mel',
    spec_encoder_mlp_depth=2,
    bottleneck_lookahead_frames=LOOKAHEAD,
    ppg_encoder_path=PPG_PRETRAINED_PATH,
    n_bands=128,
    spec_encoder_hidden_size=512,
    bottleneck_skip=True,
    bottleneck_hidden_size=512,
    bottleneck_feedforward_size=512,
    bottleneck_type='lstm',
    bottleneck_depth=2,
    control_eps=0.5,
    projection_norm=float('inf'),
    conditioning_dim=512,
)
#Load pretrained model:
model = vb.VoiceBox(**voicebox_kwargs)
model.load_state_dict(
    torch.load(
        'voicebox/pretrained/voicebox/voicebox_final.pt',
        # Map the stored tensors onto the CPU while deserializing.
        map_location=torch.device('cpu'),
    ),
    # strict=True: the checkpoint keys must match the module's keys exactly.
    strict=True,
)
# Put the module in evaluation mode before serving requests.
model.eval()
#Define function to convert final audio format:
def float32_to_int16(waveform):
    """Peak-normalize a waveform and return it as a flat int16 array.

    The samples are divided by their largest absolute value, scaled by
    32767, cast to ``np.int16`` and flattened to one dimension.

    Args:
        waveform: Array-like of floating-point samples, any shape.

    Returns:
        A 1-D ``np.int16`` array with one entry per input sample. An empty
        input yields an empty array; an all-zero input yields all zeros.
    """
    waveform = np.asarray(waveform)
    # .max() raises on an empty array, so treat "no samples" as a zero peak.
    peak = np.abs(waveform).max() if waveform.size else 0.0
    # Skip normalization for silent input: dividing by a zero peak would
    # produce NaN, which has no meaningful int16 representation.
    if peak > 0:
        waveform = waveform / peak
    waveform = waveform * 32767
    return waveform.astype(np.int16).ravel()
#Define predict function:
def predict(inp):
    """Run the audio file at ``inp`` through ``model`` for the gradio interface.

    Args:
        inp: Location of the audio file, passed straight to ``torchaudio.load``.

    Returns:
        A ``(sample_rate, samples)`` tuple: the sample rate reported by
        ``torchaudio.load`` and the model output converted by
        ``float32_to_int16``.
    """
    # Decode the file into a tensor plus the sample rate it was stored at.
    audio, rate = torchaudio.load(inp)
    # Inference only: no_grad keeps autograd from tracking the forward pass.
    with torch.no_grad():
        processed = model(audio)
    # Hand gradio a numpy int16 array instead of a torch tensor.
    samples = float32_to_int16(processed.numpy())
    return rate, samples
#Set up gradio interface
import gradio as gr

# The input component is configured with type="filepath", matching predict(),
# which loads its argument with torchaudio.load.
audio_input = gr.Audio(type="filepath")
audio_output = gr.Audio()
interface = gr.Interface(fn=predict, inputs=audio_input, outputs=audio_output)
interface.launch()