File size: 2,588 Bytes
248de36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
"""
This code is for testing and demonstration purposes only.
Credit for the original source code: https://huggingface.co/spaces/nithinraok/titanet-speaker-verification/blob/main/app.py
"""

import gradio as gr
import torch
from nemo.collections.asr.models import EncDecSpeakerLabelModel

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Identifier of the pretrained speaker model handed to from_pretrained().
model_name = "nvidia/speakerverification_en_titanet_large"

# Load the pretrained model once at import time, then move it to the chosen device.
model = EncDecSpeakerLabelModel.from_pretrained(model_name)
model = model.to(device)

def compare(path1, path2):
    """Score how similar the voices in two recordings are.

    An embedding is extracted from each recording with the module-level
    ``model``. The cosine similarity of the two embeddings is then rescaled
    from [-1, 1] to [0, 1].

    Args:
        path1: File path of the first recording (falsy when none was given).
        path2: File path of the second recording (falsy when none was given).

    Returns:
        The rescaled similarity score formatted as a string with four
        decimal places.

    Raises:
        gr.Error: If either recording is missing.
    """
    if not (path1 and path2):
        raise gr.Error("Need recordings from both speakers!")

    # squeeze() drops singleton dimensions so the two embeddings can be
    # compared along dim 0.
    embs1 = model.get_embedding(path1).squeeze()
    embs2 = model.get_embedding(path2).squeeze()

    # Cosine similarity is scale-invariant, so no separate length
    # normalisation is needed. Its eps guard also keeps a zero-norm
    # embedding from turning the score into NaN.
    cosine = torch.nn.functional.cosine_similarity(embs1, embs2, dim=0)

    # Map the cosine from [-1, 1] onto [0, 1].
    similarity_score = (cosine + 1) / 2
    return "{:.4f}".format(similarity_score.item())


# Both speakers get the same kind of audio widget; only the label differs.
# This pair records from the microphone ...
inputs = [
    gr.inputs.Audio(source="microphone", type="filepath", optional=True, label=speaker)
    for speaker in ("Speaker #1", "Speaker #2")
]

# ... and this pair accepts uploaded audio files instead.
upload_inputs = [
    gr.inputs.Audio(source="upload", type="filepath", optional=True, label=speaker)
    for speaker in ("Speaker #1", "Speaker #2")
]

# Description text passed to both Gradio interfaces.
description = (
    "The purpose of this demo is to show how VoID could work with speech embeddings rather than mel spectrograms.\n"
    "This demonstration will analyze two recordings of speech and ascertain whether they have been spoken by the same individual.\n"
    "You can attempt this exercise using your own voice."
)

# Title passed to both Gradio interfaces.
title = "VoID with TitaNet Embeddings"

# Settings shared by both tabs; the two interfaces differ only in their inputs.
_shared_interface_kwargs = dict(
    fn=compare,
    outputs="text",
    title=title,
    description=description,
    layout="horizontal",
    theme="huggingface",
    allow_flagging=False,
    live=False,
)

# Interface whose two recordings come from the microphone widgets.
microphone_interface = gr.Interface(inputs=inputs, **_shared_interface_kwargs)

# Interface whose two recordings come from the file-upload widgets.
upload_interface = gr.Interface(inputs=upload_inputs, **_shared_interface_kwargs)

# Present the two interfaces as named tabs of a single app.
demo = gr.TabbedInterface(
    [microphone_interface, upload_interface],
    ["Microphone", "Upload File"],
)

# Start serving the app.
demo.launch()