import json
import uuid

import gradio as gr
import pandas as pd
import torch
from nemo.collections.asr.models import EncDecRNNTBPEModel, EncDecSpeakerLabelModel
from nemo.collections.asr.models.msdd_models import NeuralDiarizer

device = "cuda" if torch.cuda.is_available() else "cpu"

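# diar_msdd_telephonic: multi-scale diarization decoder (MSDD) trained on
# telephonic speech. TitaNet-Large provides speaker-verification embeddings
# (loaded here but not used by the pipeline below).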
model = NeuralDiarizer.from_pretrained("diar_msdd_telephonic").to(device)
speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large").to(device)
model.eval()

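# RTTM lines have the form:
#   SPEAKER <file-id> <channel> <onset> <duration> <NA> <NA> <speaker> <NA> <NA>
# so split() fields 3, 4 and 7 give the segment start, duration and speaker.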
def run_diarization(path1):
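    """Diarize an audio file, then transcribe each speaker turn.

    Returns a DataFrame with start_time, end_time, speaker and text columns.
    """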
    annotation = model(path1, num_workers=0, batch_size=16)
    rttm = annotation.to_rttm()
    df = pd.DataFrame(columns=['start_time', 'end_time', 'speaker', 'text'])
    lines = rttm.splitlines()
    if len(lines) == 0:
        # One value per column, including an empty transcript.
        df.loc[0] = 0, 0, 'No speaker found', ''
        return df
    fields = lines[0].split()
    start_time, duration, prev_speaker = float(fields[3]), float(fields[4]), fields[7]
    end_time = start_time + duration
    df.loc[0] = start_time, end_time, prev_speaker, ''

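    # Merge consecutive segments that share a speaker label.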
    for line in lines[1:]:
        fields = line.split()
        start_time, duration, cur_speaker = float(fields[3]), float(fields[4]), fields[7]
        end_time = start_time + duration
        if cur_speaker == prev_speaker:
            df.loc[df.index[-1], 'end_time'] = end_time
        else:
            df.loc[len(df)] = start_time, end_time, cur_speaker, ''
        prev_speaker = cur_speaker
    
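    # Transcribe each diarized segment; expect one hypothesis per row.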
    hyp = get_transcripts(df, path1)

    assert len(hyp) == len(df)

    for i in range(len(df)):
        df.loc[i, 'text'] = hyp[i]

    return df

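# NeMo expects a JSON-lines manifest: one object per segment pointing at the
# source audio with an offset/duration window and the diarized speaker label.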
def create_manifest(df, audio_path):
    filename = '/tmp/' + str(uuid.uuid4()) + '.json'
    with open(filename, 'w') as f:
        for i in range(len(df)):
            start_time = df.iloc[i]['start_time']
            end_time = df.iloc[i]['end_time']
            speaker = df.iloc[i]['speaker']
            dic = {"audio_filepath": audio_path, "duration": end_time-start_time, "label": speaker, "offset": start_time}
            json.dump(dic, f)
            f.write('\n')

    return filename

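# Note: the ASR model is re-instantiated on every call; caching it at module
# level (like the diarizer) would avoid repeated downloads. Passing a manifest
# path straight to transcribe() assumes a recent NeMo release.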
def get_transcripts(df, audio_path):
    filename = create_manifest(df, audio_path)
    # Named asr_model so it does not shadow the global diarization model.
    asr_model = EncDecRNNTBPEModel.from_pretrained(model_name="nvidia/stt_en_fastconformer_transducer_large").to(device)
    asr_model.eval()
    return asr_model.transcribe(filename, batch_size=2)

article = (
    "<p style='text-align: center'>"
    "<a href='https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/diar_msdd_telephonic' target='_blank'>πŸŽ™οΈ Learn more about MSDD model</a> | "
    "<a href='https://arxiv.org/abs/2203.15974' target='_blank'>πŸ“š MSDD paper</a> | "
    "<a href='https://github.com/NVIDIA/NeMo' target='_blank'>πŸ§‘β€πŸ’» Repository</a>"
    "</p>"
)
examples = [
    ["data/conversation.wav"],
    ["data/id10270_5r0dWxy17C8-00001.wav"],
]

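# Two interfaces share the same pipeline; only the audio source differs.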
microphone_interface = gr.Interface(
    fn=run_diarization,
    inputs=[gr.Audio(sources=["microphone"], type="filepath", label="Mic Audio")],
    outputs=[gr.Dataframe(wrap=True, label='Speaker Diarization with Speech Recognition',
        row_count=(1, "dynamic"), headers=['start_time', 'end_time', 'speaker', 'text'])],
    title="Offline Speaker Diarization with NeMo",
    description="This demo performs offline speaker diarization on an audio file using NeMo.",
    article=article,
    allow_flagging="never",
    live=False,
    examples=examples,
)

upload_interface = gr.Interface(
    fn=run_diarization,
    inputs=[gr.Audio(sources=["upload"], type="filepath", label="Upload File")],
    outputs=[gr.Dataframe(wrap=True, label='Speaker Diarization with Speech Recognition',
        row_count=(1, "dynamic"), headers=['start_time', 'end_time', 'speaker', 'text'])],
    title="Offline Speaker Diarization with NeMo",
    description="This demo performs offline speaker diarization on an audio file using NeMo.",
    article=article,
    allow_flagging="never",
    live=False,
    examples=examples,
)

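# Present both input modes as tabs in a single app.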
demo = gr.TabbedInterface([microphone_interface, upload_interface], ["Microphone", "Upload File"])

demo.queue(max_size=2, default_concurrency_limit=1)
demo.launch()