# NOTE(review): removed scraped HuggingFace-Spaces page chrome (Spaces header,
# build-status lines, commit hashes, and a column of viewer line numbers) that
# was fused onto this file during extraction — it is not part of the program.
from nemo.collections.asr.models.msdd_models import NeuralDiarizer
from nemo.collections.asr.models import EncDecRNNTBPEModel
from nemo.collections.asr.models import EncDecSpeakerLabelModel
import gradio as gr
import pandas as pd
import torch
import json
from omegaconf import OmegaConf
import uuid
# Run on GPU when available; every model below is moved to this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Multi-scale diarization decoder (MSDD) model tuned for telephonic speech;
# this is the `model` used by run_diarization().
model = NeuralDiarizer.from_pretrained("diar_msdd_telephonic").to(device)
# NOTE(review): speaker_model is loaded here but never referenced elsewhere in
# this file — confirm it is actually needed before keeping the download.
speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large").to(device)
model.eval()
def run_diarization(path1):
    """Diarize an audio file and transcribe each speaker turn.

    Args:
        path1: Path to the input audio file.

    Returns:
        pandas.DataFrame with columns ``start_time``, ``end_time``,
        ``speaker`` and ``text`` — one row per merged speaker turn.

    Raises:
        RuntimeError: if the ASR step returns a different number of
            transcripts than there are speaker turns.
    """
    annotation = model(path1, num_workers=0, batch_size=16)
    rttm = annotation.to_rttm()
    df = pd.DataFrame(columns=['start_time', 'end_time', 'speaker', 'text'])
    lines = rttm.splitlines()
    if not lines:
        # Bug fix: the DataFrame has four columns, so the placeholder row
        # must supply four values (the original assigned only three, which
        # raises ValueError at runtime).
        df.loc[0] = 0, 0, 'No speaker found', ''
        return df
    # RTTM fields: [3] = onset seconds, [4] = duration seconds, [7] = speaker.
    fields = lines[0].split()
    start_time, duration, prev_speaker = float(fields[3]), float(fields[4]), fields[7]
    df.loc[0] = start_time, start_time + duration, prev_speaker, ''
    for line in lines[1:]:
        fields = line.split()
        start_time, duration, cur_speaker = float(fields[3]), float(fields[4]), fields[7]
        end_time = start_time + duration
        if cur_speaker == prev_speaker:
            # Merge contiguous segments from the same speaker into one turn.
            df.loc[df.index[-1], 'end_time'] = end_time
        else:
            df.loc[len(df)] = start_time, end_time, cur_speaker, ''
        prev_speaker = cur_speaker
    hyp = get_transcripts(df, path1)
    # Explicit check instead of `assert` (asserts are stripped under -O).
    if len(hyp) != len(df):
        raise RuntimeError(
            f"ASR returned {len(hyp)} transcripts for {len(df)} speaker turns"
        )
    df['text'] = list(hyp)
    return df
def create_manifest(df, audio_path):
    """Write a NeMo JSON-lines manifest describing each segment in *df*.

    Every row of *df* (columns ``start_time``, ``end_time``, ``speaker``)
    becomes one JSON object with the audio path, segment duration, speaker
    label and offset. Returns the path of the manifest file created in /tmp.
    """
    manifest_path = '/tmp/' + str(uuid.uuid4()) + '.json'
    records = []
    for _, row in df.iterrows():
        entry = {
            "audio_filepath": audio_path,
            "duration": row['end_time'] - row['start_time'],
            "label": row['speaker'],
            "offset": row['start_time'],
        }
        records.append(json.dumps(entry))
    with open(manifest_path, 'w') as handle:
        for record in records:
            handle.write(record + '\n')
    return manifest_path
def get_transcripts(df, audio_path):
    """Transcribe each diarized segment of *audio_path* listed in *df*.

    Builds a temporary NeMo manifest from *df* and runs a pretrained
    FastConformer transducer over it.

    Args:
        df: DataFrame with ``start_time``, ``end_time``, ``speaker`` columns.
        audio_path: Path to the audio file the segments refer to.

    Returns:
        The transcripts returned by NeMo, one per row of *df*.
    """
    manifest = create_manifest(df, audio_path)
    # Fix: the original re-downloaded and re-instantiated the ASR model on
    # every call (and shadowed the module-level diarization `model`).
    # Load it once and cache it on the function object for reuse.
    asr_model = getattr(get_transcripts, "_asr_model", None)
    if asr_model is None:
        asr_model = EncDecRNNTBPEModel.from_pretrained(
            model_name="nvidia/stt_en_fastconformer_transducer_large"
        ).to(device)
        asr_model.eval()
        get_transcripts._asr_model = asr_model
    return asr_model.transcribe(manifest, batch_size=2)
# Footer HTML shown under both Gradio interfaces: links to the MSDD model
# card, the MSDD paper, and the NeMo repository.
article = (
    "<p style='text-align: center'>"
    "<a href='https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/diar_msdd_telephonic' target='_blank'>ποΈ Learn more about MSDD model</a> | "
    "<a href='https://arxiv.org/abs/2203.15974' target='_blank'>π MSDD paper</a> | "
    "<a href='https://github.com/NVIDIA/NeMo' target='_blank'>π§βπ» Repository</a>"
    "</p>"
)
# Sample audio files offered as one-click examples in the UI.
# NOTE(review): these paths are relative to the app's working directory —
# confirm the data/ folder ships with the deployment.
examples = [
    ["data/conversation.wav"],
    ["data/id10270_5r0dWxy17C8-00001.wav"],
]
# Gradio tab that diarizes live microphone recordings.
microphone_interface = gr.Interface(
    fn=run_diarization,
    inputs=[gr.Audio(source="microphone", type="filepath", label="Mic Audio")],
    # Fixed user-facing label typo: "Diariazation" -> "Diarization".
    outputs=[gr.components.Dataframe(wrap=True, label='Speaker Diarization with Speech Recognition',
                                     row_count=(1, "dynamic"), headers=['start_time', 'end_time', 'speaker', 'text'])],
    title="Offline Speaker Diarization with NeMo",
    description="This demonstration will perform offline speaker diarization on an audio file using nemo",
    article=article,
    theme="huggingface",
    allow_flagging=False,
    live=False,
    examples=examples,
)
# Gradio tab that diarizes an uploaded audio file.
upload_interface = gr.Interface(
    fn=run_diarization,
    inputs=[gr.Audio(source="upload", type='filepath', label='Upload File')],
    # Fixed user-facing label typo: "Diariazation" -> "Diarization".
    outputs=[gr.components.Dataframe(wrap=True, label='Speaker Diarization with Speech Recognition',
                                     row_count=(1, "dynamic"), headers=['start_time', 'end_time', 'speaker', 'text'])],
    title="Offline Speaker Diarization with NeMo",
    description="This demonstration will perform offline speaker diarization on an audio file using nemo",
    article=article,
    theme="huggingface",
    allow_flagging=False,
    live=False,
    examples=examples,
)
# Combine both input modes into a single tabbed app.
demo = gr.TabbedInterface([microphone_interface, upload_interface], ["Microphone", "Upload File"])
# Bound the request queue so concurrent jobs don't exhaust GPU memory.
demo.queue(max_size=2, default_concurrency_limit=1)
# Fix: removed a stray " |" scrape artifact that was fused onto this line
# and made the file a syntax error.
demo.launch()