vits-simple-api-gsv / bert_vits2 /clap_wrapper.py
Artrajz's picture
init
960cd20
raw
history blame
552 Bytes
import torch
def get_clap_audio_feature(audio_data, clap_model, processor, device):
with torch.no_grad():
inputs = processor(
audios=audio_data, return_tensors="pt", sampling_rate=48000
).to(device)
emb = clap_model.get_audio_features(**inputs).float()
return emb.T
def get_clap_text_feature(text, clap_model, processor, device):
with torch.no_grad():
inputs = processor(text=text, return_tensors="pt").to(device)
emb = clap_model.get_text_features(**inputs).float()
return emb.T