import torch
import torchaudio
import numpy as np

# Project-local imports.
import voicebox.src.attacks.offline.perturbation.voicebox.voicebox as vb  # VoiceBox class
# import voicebox.src.attacks.online.voicebox_streamer as streamer  # VoiceBoxStreamer class (unused alternative)
from voicebox.src.constants import PPG_PRETRAINED_PATH
from voicebox.src.models import ResNetSE34V2

# VoiceBox default hyperparameters. These mirror the pretrained checkpoint's
# training configuration; `strict=True` below will fail loudly on a mismatch.
LOOKAHEAD = 5
voicebox_kwargs = {
    'win_length': 256,
    'ppg_encoder_hidden_size': 256,
    'use_phoneme_encoder': True,
    'use_pitch_encoder': True,
    'use_loudness_encoder': True,
    'spec_encoder_lookahead_frames': 0,
    'spec_encoder_type': 'mel',
    'spec_encoder_mlp_depth': 2,
    'bottleneck_lookahead_frames': LOOKAHEAD,
    'ppg_encoder_path': PPG_PRETRAINED_PATH,
    'n_bands': 128,
    'spec_encoder_hidden_size': 512,
    'bottleneck_skip': True,
    'bottleneck_hidden_size': 512,
    'bottleneck_feedforward_size': 512,
    'bottleneck_type': 'lstm',
    'bottleneck_depth': 2,
    'control_eps': 0.5,
    'projection_norm': float('inf'),
    'conditioning_dim': 512,
}

# Alternative streaming setup, kept for reference.
# NOTE(review): this commented path uses `yaml`, which is not imported at the
# top of the file — add `import yaml` before re-enabling it.
'''
#Set streamer default parameters:
config_path = 'voicebox/pretrained/voicebox/voicebox_final.yaml'
with open(config_path) as f:
    config = yaml.safe_load(f)

#Load pretrained model (streamer):
model = streamer.VoiceBoxStreamer(**config)
model.load_state_dict(torch.load('voicebox/pretrained/voicebox/voicebox_final.pt', map_location=torch.device('cpu')), strict=True)
model.eval()
'''

# Load the pretrained VoiceBox model on CPU and switch to inference mode.
model = vb.VoiceBox(**voicebox_kwargs)
model.load_state_dict(
    torch.load(
        'voicebox/pretrained/voicebox/voicebox_final.pt',
        map_location=torch.device('cpu'),
    ),
    strict=True,
)
model.eval()


def float32_to_int16(waveform):
    """Convert a float waveform to a flat, peak-normalized int16 array.

    Args:
        waveform (np.ndarray): float audio samples, any shape.

    Returns:
        np.ndarray: 1-D int16 samples scaled to full range, suitable for
        Gradio's audio output.
    """
    peak = np.abs(waveform).max()
    # Fix: guard against all-zero (silent) input — dividing by a zero peak
    # produced NaNs in the original.
    if peak > 0:
        waveform = waveform / peak
    waveform = (waveform * 32767).astype(np.int16)
    return waveform.ravel()


def get_embedding(recording):
    """Compute a 512-dim speaker embedding for a recording.

    Args:
        recording (torch.Tensor): audio samples; flattened to shape (1, -1)
        before encoding.

    Returns:
        torch.Tensor: the encoder output (speaker embedding).
    """
    # NOTE(review): the encoder is freshly constructed with random weights on
    # every call — no pretrained checkpoint is loaded here. Confirm whether a
    # state_dict should be loaded for meaningful speaker embeddings.
    resnet = ResNetSE34V2(nOut=512, encoder_type='ASP')
    resnet.eval()  # Fix: inference must not run in training mode (batchnorm/dropout).
    recording = recording.view(1, -1)
    with torch.no_grad():  # Fix: no gradients are needed for embedding extraction.
        embedding = resnet(recording)
    return embedding
# Define the Gradio prediction function.
def predict(inp):
    """Run an audio file through VoiceBox and return playable 16 kHz audio.

    Args:
        inp (str): filesystem path to the uploaded audio file
            (Gradio `Audio(type="filepath")`).

    Returns:
        tuple[int, np.ndarray]: (sample_rate, int16 samples) in the format
        expected by Gradio's audio output component.
    """
    waveform, sample_rate = torchaudio.load(inp)

    # Fix: only build and apply a Resample transform when the input is not
    # already at 16 kHz (the original resampled unconditionally on every call).
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=16000
        )
        waveform = resampler(waveform)
    sample_rate = 16000

    # Speaker-conditioning embedding, repeated along the time axis.
    condition_tensor = get_embedding(waveform).reshape(1, 1, -1)
    # NOTE(review): waveform.shape[1] is the number of *samples*, not
    # spectrogram frames — confirm the model expects per-sample conditioning.
    n_frames = waveform.shape[1]
    condition_tensor = condition_tensor.repeat(1, n_frames, 1)

    # Inference only: no gradient tracking.
    with torch.no_grad():
        waveform = model(x=waveform, y=condition_tensor)

    # Convert the model output into a Gradio-readable (rate, int16) pair.
    waveform = float32_to_int16(waveform.numpy())
    return sample_rate, waveform


# Set up the Gradio interface.
import gradio as gr

interface = gr.Interface(
    fn=predict,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.Audio(),
)
interface.launch()