# Source: Hugging Face Space file viewer (page links "raw", "history", "blame" omitted).
# Last commit: "Update app.py" by nithinraok (40890a4, verified); file size 4.31 kB.
from nemo.collections.asr.models.msdd_models import NeuralDiarizer
from nemo.collections.asr.models import EncDecRNNTBPEModel
from nemo.collections.asr.models import EncDecSpeakerLabelModel
import gradio as gr
import pandas as pd
import torch
import json
from omegaconf import OmegaConf
import uuid
# Use the GPU whenever CUDA is usable; otherwise everything runs on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Diarization model; run_diarization() reaches it through the global name `model`.
model = NeuralDiarizer.from_pretrained("diar_msdd_telephonic")
model = model.to(device)

# Speaker-label model (TitaNet checkpoint, per its name). It is loaded here but
# not referenced by any other code visible in this file.
speaker_model = EncDecSpeakerLabelModel.from_pretrained(
    "nvidia/speakerverification_en_titanet_large"
)
speaker_model = speaker_model.to(device)

# Switch the diarizer to evaluation mode before serving requests.
model.eval()
def _merge_speaker_turns(rttm_lines):
    """Collapse consecutive same-speaker RTTM segments into speaker turns.

    Each non-blank line is split on whitespace; field 3 is read as the start
    time, field 4 as the duration and field 7 as the speaker label.

    Returns a list of ``[start_time, end_time, speaker, '']`` rows, where the
    trailing empty string is the placeholder for the transcript.
    """
    turns = []
    for line in rttm_lines:
        fields = line.split()
        if not fields:
            # Tolerate blank lines instead of failing with an IndexError.
            continue
        start_time, duration, speaker = float(fields[3]), float(fields[4]), fields[7]
        end_time = start_time + duration
        if turns and turns[-1][2] == speaker:
            # Same speaker keeps talking: extend the current turn.
            turns[-1][1] = end_time
        else:
            turns.append([start_time, end_time, speaker, ''])
    return turns


def run_diarization(path1):
    """Diarize the audio file at ``path1`` and transcribe every speaker turn.

    Args:
        path1: Path to the audio file, as handed over by the Gradio audio input.

    Returns:
        A ``pandas.DataFrame`` with the columns ``start_time``, ``end_time``,
        ``speaker`` and ``text``, one row per speaker turn. When diarization
        yields no segments, a single placeholder row
        ``(0, 0, 'No speaker found', '')`` is returned.

    Raises:
        RuntimeError: If the number of transcripts does not match the number
            of speaker turns.
    """
    columns = ['start_time', 'end_time', 'speaker', 'text']

    annotation = model(path1, num_workers=0, batch_size=16)
    turns = _merge_speaker_turns(annotation.to_rttm().splitlines())

    if not turns:
        # The placeholder row must supply a value for all four columns;
        # assigning only three raises a pandas ValueError.
        return pd.DataFrame([[0, 0, 'No speaker found', '']], columns=columns)

    df = pd.DataFrame(turns, columns=columns)

    hyp = get_transcripts(df, path1)
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if len(hyp) != len(df):
        raise RuntimeError(
            f"Expected {len(df)} transcripts (one per speaker turn) but got {len(hyp)}"
        )
    df['text'] = list(hyp)
    return df
def create_manifest(df, audio_path):
    """Write a JSON-lines manifest with one entry per row of ``df``.

    Every line is a JSON object with the keys ``audio_filepath``
    (``audio_path``), ``duration`` (``end_time - start_time``), ``label``
    (the row's ``speaker``) and ``offset`` (``start_time``).

    Args:
        df: DataFrame with ``start_time``, ``end_time`` and ``speaker`` columns.
        audio_path: Path of the audio file all segments refer to.

    Returns:
        Path of the newly created manifest file. The file is not deleted
        here; the caller owns it.
    """
    import tempfile

    # A uniquely named file in the platform's temp directory replaces the
    # hand-built '/tmp/<uuid>.json' path. The '.json' suffix is kept because
    # the consumer presumably recognises manifests by extension (TODO confirm).
    with tempfile.NamedTemporaryFile(
        mode='w', suffix='.json', delete=False, encoding='utf-8'
    ) as f:
        for start_time, end_time, speaker in zip(
            df['start_time'], df['end_time'], df['speaker']
        ):
            # Normalise to built-in floats so NumPy integer scalars cannot
            # break JSON serialisation.
            offset = float(start_time)
            entry = {
                "audio_filepath": audio_path,
                "duration": float(end_time) - offset,
                "label": speaker,
                "offset": offset,
            }
            json.dump(entry, f)
            f.write('\n')
        return f.name
_ASR_MODEL_NAME = "nvidia/stt_en_fastconformer_transducer_large"
_asr_model = None


def _load_asr_model():
    """Return the shared ASR model, loading and caching it on first use."""
    global _asr_model
    if _asr_model is None:
        loaded = EncDecRNNTBPEModel.from_pretrained(model_name=_ASR_MODEL_NAME).to(device)
        loaded.eval()
        _asr_model = loaded
    return _asr_model


def get_transcripts(df, audio_path):
    """Transcribe every segment of ``df`` taken from ``audio_path``.

    A temporary manifest describing the segments is written with
    ``create_manifest`` and passed to the ASR model's ``transcribe`` method
    with ``batch_size=2``.

    Args:
        df: DataFrame of segments, forwarded to ``create_manifest``.
        audio_path: Path of the audio file the segments belong to.

    Returns:
        Whatever ``transcribe`` returns for the manifest; the caller expects
        one entry per row of ``df``.
    """
    import contextlib
    import os

    manifest_path = create_manifest(df, audio_path)
    try:
        # The checkpoint is loaded once and reused instead of being
        # re-instantiated on every request. The local name also no longer
        # shadows the module-level diarizer `model`.
        return _load_asr_model().transcribe(manifest_path, batch_size=2)
    finally:
        # The manifest is only needed for this call; remove it so temporary
        # files do not pile up. Cleanup is best-effort.
        with contextlib.suppress(OSError):
            os.remove(manifest_path)
# HTML footer shown under each interface: links to the MSDD model card, the
# MSDD paper and the NeMo repository. The link labels start with emoji; they
# were previously stored as mojibake (UTF-8 bytes decoded with a single-byte
# codepage) and are restored to the intended characters here.
article = (
    "<p style='text-align: center'>"
    "<a href='https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/diar_msdd_telephonic' target='_blank'>🎙️ Learn more about MSDD model</a> | "
    "<a href='https://arxiv.org/abs/2203.15974' target='_blank'>📚 MSDD paper</a> | "
    "<a href='https://github.com/NVIDIA/NeMo' target='_blank'>🧑‍💻 Repository</a>"
    "</p>"
)
# Bundled sample recordings offered as one-click examples. Gradio expects one
# inner list per example, holding one value per input component.
_EXAMPLE_AUDIO_FILES = (
    "data/conversation.wav",
    "data/id10270_5r0dWxy17C8-00001.wav",
)
examples = [[audio_file] for audio_file in _EXAMPLE_AUDIO_FILES]
def _build_interface(audio_source, audio_label):
    """Build one diarization ``gr.Interface`` for the given audio input source.

    Args:
        audio_source: Gradio audio source name (``"microphone"`` or ``"upload"``).
        audio_label: Label displayed on the audio input component.

    Returns:
        A ``gr.Interface`` that runs ``run_diarization`` on the recorded or
        uploaded file and shows the result in a dataframe. Each call creates
        its own input and output components.
    """
    return gr.Interface(
        fn=run_diarization,
        # Gradio 4 takes a list under `sources`; the singular `source` keyword
        # is Gradio 3 only and cannot coexist with the Gradio 4-only
        # `default_concurrency_limit` passed to `demo.queue()` in this file.
        inputs=[gr.Audio(sources=[audio_source], type="filepath", label=audio_label)],
        outputs=[
            gr.components.Dataframe(
                wrap=True,
                label='Speaker Diarization with Speech Recognition',
                row_count=(1, "dynamic"),
                headers=['start_time', 'end_time', 'speaker', 'text'],
            )
        ],
        title="Offline Speaker Diarization with NeMo",
        description="This demonstration will perform offline speaker diarization on an audio file using nemo",
        article=article,
        theme="huggingface",
        # String form of the deprecated boolean `False` (flagging disabled).
        allow_flagging="never",
        live=False,
        examples=examples,
    )


# The two tabs differ only in where the audio comes from.
microphone_interface = _build_interface("microphone", "Mic Audio")
upload_interface = _build_interface("upload", "Upload File")
# Present both interfaces as tabs of a single app.
demo = gr.TabbedInterface(
    interface_list=[microphone_interface, upload_interface],
    tab_names=["Microphone", "Upload File"],
)

# Enable request queuing with the app's limits (queue size 2, default
# concurrency limit 1), then start the server.
demo.queue(default_concurrency_limit=1, max_size=2)
demo.launch()