# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
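
"""Speaker-similarity evaluation based on RawNet3 speaker embeddings.

Extracts utterance-level RawNet3 embeddings for every audio file in a target
directory and a reference directory, then reports the cosine similarity
between the two mean embeddings.
"""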

import os

import librosa
import numpy as np
import soundfile as sf
import torch
import torch.nn.functional as F
from tqdm import tqdm

from .models.RawNetBasicBlock import Bottle2neck
from .models.RawNetModel import RawNet3


def extract_speaker_embd(
    model, fn: str, n_samples: int, n_segments: int = 10, gpu: bool = False
) -> torch.Tensor:
    """Extract segment-level RawNet3 speaker embeddings from one audio file.

    Returns a tensor of shape (n_segments, embedding_dim).
    """
    audio, sample_rate = sf.read(fn)
    if len(audio.shape) > 1:
        raise ValueError(
            f"RawNet3 supports mono input only. Input data has a shape of {audio.shape}."
        )

    if sample_rate != 16000:
        # RawNet3 expects 16 kHz input; resample if necessary.
        audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)

    if len(audio) < n_samples:  # RawNet3 was trained on 3-second utterances.
        shortage = n_samples - len(audio) + 1
        audio = np.pad(audio, (0, shortage), "wrap")

    # Slice n_segments windows of n_samples each, evenly spaced over the file.
    audios = []
    startframe = np.linspace(0, len(audio) - n_samples, num=n_segments)
    for asf in startframe:
        audios.append(audio[int(asf) : int(asf) + n_samples])

    audios = torch.from_numpy(np.stack(audios, axis=0).astype(np.float32))
    if gpu:
        audios = audios.to("cuda")
    with torch.no_grad():
        output = model(audios)
    return output
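
# A quick sanity check (hypothetical file name "sample.wav"): with the
# RawNet3 configuration used below (nOut=256) and n_samples=48000 (3 s at
# 16 kHz), the call
#     emb = extract_speaker_embd(model, "sample.wav", n_samples=48000)
# is expected to return a (10, 256) tensor; averaging over dim 0 yields a
# single utterance-level embedding, which is how
# extract_speaker_similarity() uses it.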


def extract_speaker_similarity(target_path, reference_path):
    """Compute the cosine similarity between the mean RawNet3 embeddings of
    all audio files under target_path and all files under reference_path."""
    model = RawNet3(
        Bottle2neck,
        model_scale=8,
        context=True,
        summed=True,
        encoder_type="ECA",
        nOut=256,
        out_bn=False,
        sinc_stride=10,
        log_sinc=True,
        norm_sinc="mean",
        grad_mult=1,
    )
    gpu = False

    # The architecture above must match the pretrained checkpoint exactly.
    model.load_state_dict(
        torch.load(
            "pretrained/rawnet3/model.pt",
            map_location=lambda storage, loc: storage,
        )["model"]
    )
    model.eval()
    print("RawNet3 initialised & weights loaded!")

    if torch.cuda.is_available():
        print("Cuda available, conducting inference on GPU")
        model = model.to("cuda")
        gpu = True

    # Average the segment-level embeddings per file, then across all files,
    # to obtain a single embedding for the target speaker.
    target_embeddings = []
    for file in tqdm(os.listdir(target_path)):
        output = extract_speaker_embd(
            model,
            fn=os.path.join(target_path, file),
            n_samples=48000,
            n_segments=10,
            gpu=gpu,
        ).mean(0)
        target_embeddings.append(output.detach().cpu().numpy())
    target_embedding = np.mean(np.array(target_embeddings), axis=0)
# print(f"Extracting embeddings for reference singer...") | |
reference_embeddings = [] | |
for file in tqdm(os.listdir(reference_path)): | |
output = extract_speaker_embd( | |
model, | |
fn=os.path.join(reference_path, file), | |
n_samples=48000, | |
n_segments=10, | |
gpu=gpu, | |
).mean(0) | |
reference_embeddings.append(output.detach().cpu().numpy()) | |
reference_embeddings = np.array(reference_embeddings) | |
# print("Calculating cosine similarity...") | |
cos_sim = F.cosine_similarity( | |
torch.from_numpy(np.mean(target_embeddings, axis=0)).unsqueeze(0), | |
torch.from_numpy(np.mean(reference_embeddings, axis=0)).unsqueeze(0), | |
dim=1, | |
) | |
# print(f"Mean cosine similarity: {cos_sim.item()}") | |
return cos_sim.item() | |
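

# Minimal usage sketch. The directory names below are hypothetical; each
# directory is assumed to contain mono audio files readable by soundfile.
# Because this module uses relative imports, invoke it with `python -m`.
if __name__ == "__main__":
    similarity = extract_speaker_similarity(
        "data/target_singers",  # hypothetical directory of target audio
        "data/reference_singer",  # hypothetical directory of reference audio
    )
    print(f"Speaker similarity: {similarity:.4f}")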