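"""Gradio demo: offline speaker diarization with NVIDIA NeMo (MSDD), plus
per-segment speech recognition to fill the transcript column."""
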
import json
import uuid

import gradio as gr
import pandas as pd
import torch
from nemo.collections.asr.models import EncDecRNNTBPEModel, EncDecSpeakerLabelModel
from nemo.collections.asr.models.msdd_models import NeuralDiarizer
# Run inference on GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# MSDD neural diarizer (telephonic checkpoint) and TitaNet speaker model.
model = NeuralDiarizer.from_pretrained("diar_msdd_telephonic").to(device)
speaker_model = EncDecSpeakerLabelModel.from_pretrained("nvidia/speakerverification_en_titanet_large").to(device)
model.eval()
speaker_model.eval()
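
# RTTM serializes the diarizer output one segment per line; the whitespace-split
# fields 3, 4, and 7 (0-indexed) hold the onset, duration, and speaker label.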
def run_diarization(path1):
    df = pd.DataFrame(columns=['start_time', 'end_time', 'speaker', 'text'])
    # Gradio passes None when nothing was recorded or uploaded.
    if path1 is None:
        df.loc[0] = 0, 0, 'No audio provided', ''
        return df
    annotation = model(path1, num_workers=0, batch_size=16)
    rttm = annotation.to_rttm()
    lines = rttm.splitlines()
    if len(lines) == 0:
        df.loc[0] = 0, 0, 'No speaker found', ''
        return df
    # Seed the table with the first segment.
    start_time, duration, prev_speaker = float(lines[0].split()[3]), float(lines[0].split()[4]), lines[0].split()[7]
    end_time = start_time + duration
    df.loc[0] = start_time, end_time, prev_speaker, ''
    # Merge consecutive segments that belong to the same speaker into one row.
    for line in lines[1:]:
        split = line.split()
        start_time, duration, cur_speaker = float(split[3]), float(split[4]), split[7]
        end_time = start_time + duration
        if cur_speaker == prev_speaker:
            df.loc[df.index[-1], 'end_time'] = end_time
        else:
            df.loc[len(df)] = start_time, end_time, cur_speaker, ''
        prev_speaker = cur_speaker
    # Transcribe each merged segment and fill in the text column.
    hyp = get_transcripts(df, path1)
    assert len(hyp) == len(df)
    for i in range(len(df)):
        df.loc[i, 'text'] = hyp[i]
    return df
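
# NeMo ASR dataloaders read "manifest" files: one JSON object per line giving
# the audio path plus an offset/duration window that selects the segment.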
def create_manifest(df, audio_path):
    filename = '/tmp/' + str(uuid.uuid4()) + '.json'
    with open(filename, 'w') as f:
        for i in range(len(df)):
            start_time = df.iloc[i]['start_time']
            end_time = df.iloc[i]['end_time']
            speaker = df.iloc[i]['speaker']
            dic = {"audio_filepath": audio_path, "duration": end_time - start_time, "label": speaker, "offset": start_time}
            json.dump(dic, f)
            f.write('\n')
    return filename
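
# Transcribe every diarized segment with a FastConformer-Transducer ASR model.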
def get_transcripts(df, audio_path):
    filename = create_manifest(df, audio_path)
    # The ASR model is loaded per call to keep startup light; cache it at module
    # level alongside the diarizer if reload latency becomes a problem.
    asr_model = EncDecRNNTBPEModel.from_pretrained(model_name="nvidia/stt_en_fastconformer_transducer_large").to(device)
    asr_model.eval()
    # Recent NeMo releases accept a manifest path here; older ones expect a list
    # of audio paths, and RNNT models may return Hypothesis objects, not strings.
    return asr_model.transcribe(filename, batch_size=2)
article = (
    "<p style='text-align: center'>"
    "<a href='https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/diar_msdd_telephonic' target='_blank'>🎙️ Learn more about the MSDD model</a> | "
    "<a href='https://arxiv.org/abs/2203.15974' target='_blank'>📄 MSDD paper</a> | "
    "<a href='https://github.com/NVIDIA/NeMo' target='_blank'>🧑‍💻 Repository</a>"
    "</p>"
)
examples = [
    ["data/conversation.wav"],
    ["data/id10270_5r0dWxy17C8-00001.wav"],
]
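
# Two identical interfaces, one per input source, shown as tabs below.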
microphone_interface = gr.Interface(
    fn=run_diarization,
    inputs=[gr.Audio(sources=["microphone"], type="filepath", label="Mic Audio")],
    outputs=[gr.Dataframe(wrap=True, label='Speaker Diarization with Speech Recognition',
                          row_count=(1, "dynamic"), headers=['start_time', 'end_time', 'speaker', 'text'])],
    title="Offline Speaker Diarization with NeMo",
    description="Perform offline speaker diarization on an audio file with NVIDIA NeMo, then transcribe each speaker turn.",
    article=article,
    allow_flagging="never",
    live=False,
    examples=examples,
)
upload_interface = gr.Interface(
    fn=run_diarization,
    inputs=[gr.Audio(sources=["upload"], type="filepath", label="Upload File")],
    outputs=[gr.Dataframe(wrap=True, label='Speaker Diarization with Speech Recognition',
                          row_count=(1, "dynamic"), headers=['start_time', 'end_time', 'speaker', 'text'])],
    title="Offline Speaker Diarization with NeMo",
    description="Perform offline speaker diarization on an audio file with NVIDIA NeMo, then transcribe each speaker turn.",
    article=article,
    allow_flagging="never",
    live=False,
    examples=examples,
)
demo = gr.TabbedInterface([microphone_interface, upload_interface], ["Microphone", "Upload File"])
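
# Serialize heavy GPU jobs: a small queue with one request handled at a time
# (`default_concurrency_limit` assumes Gradio >= 4.0).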
demo.queue(max_size=2, default_concurrency_limit=1)
demo.launch()