Spaces:
Running
Running
File size: 1,198 Bytes
f82071f 537486f f82071f 537486f f82071f 537486f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 |
import torch
from vencoder.encoder import SpeechEncoder
from vencoder.whisper.audio import log_mel_spectrogram, pad_or_trim
from vencoder.whisper.model import ModelDimensions, Whisper
class WhisperPPG(SpeechEncoder):
    """Speech encoder that extracts frame-level features with a Whisper audio encoder.

    The checkpoint is loaded once at construction time; ``encoder`` then turns
    a waveform into a feature tensor on the instance's device (``self.dev``).
    """

    def __init__(self, vec_path="pretrain/medium.pt", device=None):
        """Load the Whisper checkpoint and move the model to the target device.

        Args:
            vec_path: Path to a checkpoint holding a ``"dims"`` mapping (keyword
                arguments for ``ModelDimensions``) and a ``"model_state_dict"``.
            device: Anything accepted by ``torch.device``. When ``None``, CUDA
                is used if available, otherwise CPU.
        """
        super().__init__()
        if device is None:
            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.dev = torch.device(device)

        # Map storages onto the *resolved* device. Passing the raw ``device``
        # argument (None by default) would make torch.load restore tensors to
        # the device they were saved from, which fails for CUDA-saved
        # checkpoints on a CPU-only host.
        checkpoint = torch.load(vec_path, map_location=self.dev)
        dims = ModelDimensions(**checkpoint["dims"])
        model = Whisper(dims)
        model.load_state_dict(checkpoint["model_state_dict"])

        # NOTE(review): this stores the whole ModelDimensions object, not an
        # integer feature width -- confirm what callers of hidden_dim expect.
        self.hidden_dim = dims
        self.model = model.to(self.dev)

    def encoder(self, wav):
        """Encode a waveform into Whisper encoder features.

        Args:
            wav: Audio samples; ``wav.shape[0]`` is taken as the sample count.
                Presumably 16 kHz mono, giving one feature frame per 320
                samples -- TODO confirm against the caller.

        Returns:
            A float32 tensor on ``self.dev`` shaped
            ``(1, feature_dim, n_frames)``, where ``n_frames`` is at most
            ``wav.shape[0] // 320``.
        """
        # Number of feature frames that correspond to real (unpadded) audio.
        n_frames = wav.shape[0] // 320

        # pad_or_trim brings the audio to the fixed length the model expects.
        mel = log_mel_spectrogram(pad_or_trim(wav)).to(self.dev)

        with torch.no_grad():
            # Add a batch axis for the model, then drop only that axis again.
            feats = self.model.encoder(mel.unsqueeze(0)).squeeze(0).float()
            # Keep only frames backed by real audio; slicing stays on-device,
            # avoiding a device -> CPU -> NumPy -> device round trip.
            ppg = feats[:n_frames]
            # (frames, dim) -> (1, dim, frames)
            return ppg[None, :, :].transpose(1, 2)
|