Spaces:

nithinraok
/

NeMo-Offline-Speaker-Diarization

Runtime error

App Files Files Community

NeMo-Offline-Speaker-Diarization / app.py

nithinraok

Update app.py

5773ebb over 1 year ago

raw

history blame contribute delete

No virus

5.07 kB

	from nemo.collections.asr.models.msdd_models import NeuralDiarizer
	from nemo.collections.asr.models import EncDecRNNTBPEModel
	from nemo.collections.asr.models import EncDecSpeakerLabelModel
	import gradio as gr
	import pandas as pd
	import torch
	import json
	from omegaconf import OmegaConf
	import uuid

	# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	# Pretrained MSDD diarizer called by run_diarization; moved to `device`.
	model = NeuralDiarizer.from_pretrained("diar_msdd_telephonic")
	model = model.to(device)

	# Pretrained speaker model loaded from the TitaNet-large checkpoint.
	# NOTE(review): nothing else visible in this file references `speaker_model`
	# -- confirm it is still needed before paying for the load.
	speaker_model = EncDecSpeakerLabelModel.from_pretrained(
	    "nvidia/speakerverification_en_titanet_large"
	)
	speaker_model = speaker_model.to(device)

	# Put the diarizer in evaluation mode.
	model.eval()

	def _merge_rttm_segments(rttm):
	    """Collapse consecutive same-speaker RTTM lines into speaker turns.

	    Each RTTM line is whitespace-split; field 3 is read as the start time,
	    field 4 as the duration and field 7 as the speaker label.

	    Args:
	        rttm: RTTM text, one segment per line.

	    Returns:
	        A list of ``[start_time, end_time, speaker]`` lists in input order.
	    """
	    turns = []
	    for line in rttm.splitlines():
	        fields = line.split()
	        if not fields:
	            # Tolerate blank lines instead of failing on fields[3].
	            continue
	        start = float(fields[3])
	        end = start + float(fields[4])
	        speaker = fields[7]
	        if turns and turns[-1][2] == speaker:
	            # Same speaker keeps talking: extend the current turn.
	            turns[-1][1] = end
	        else:
	            turns.append([start, end, speaker])
	    return turns


	def run_diarization(path1):
	    """Diarize an audio file and transcribe every speaker turn.

	    Args:
	        path1: Path of the audio file to process.

	    Returns:
	        A DataFrame with columns ``start_time``, ``end_time``, ``speaker``
	        and ``text``, one row per speaker turn. When the diarizer yields no
	        segments, a single placeholder row ``0, 0, 'No speaker found', ''``
	        is returned.

	    Raises:
	        RuntimeError: If the number of transcripts does not match the number
	            of speaker turns.
	    """
	    columns = ['start_time', 'end_time', 'speaker', 'text']

	    annotation = model(path1, num_workers=0, batch_size=16)
	    turns = _merge_rttm_segments(annotation.to_rttm())

	    if not turns:
	        # The row must supply a value for all four columns; the previous
	        # three-value assignment raised instead of returning the placeholder.
	        return pd.DataFrame([[0, 0, 'No speaker found', '']], columns=columns)

	    df = pd.DataFrame(
	        [[start, end, speaker, ''] for start, end, speaker in turns],
	        columns=columns,
	    )

	    hyp = get_transcripts(df, path1)

	    # Explicit check rather than `assert`, which is stripped under `python -O`.
	    if len(hyp) != len(df):
	        raise RuntimeError(
	            f"Expected {len(df)} transcripts but received {len(hyp)}"
	        )

	    df['text'] = list(hyp)

	    return df

	def create_manifest(df,audio_path):
	    """Write a JSON-lines manifest with one entry per row of ``df``.

	    Every entry holds ``audio_filepath`` (``audio_path``), ``duration``
	    (``end_time - start_time``), ``label`` (the row's ``speaker``) and
	    ``offset`` (the row's ``start_time``).

	    Args:
	        df: DataFrame with ``start_time``, ``end_time`` and ``speaker`` columns.
	        audio_path: Audio file path stored in every entry.

	    Returns:
	        Path of the newly written manifest, a uniquely named file under /tmp.
	    """
	    manifest_path = f"/tmp/{uuid.uuid4()}.json"

	    with open(manifest_path, 'w') as manifest:
	        for _, row in df.iterrows():
	            offset = row['start_time']
	            entry = {
	                "audio_filepath": audio_path,
	                "duration": row['end_time'] - offset,
	                "label": row['speaker'],
	                "offset": offset,
	            }
	            # One JSON object per line.
	            manifest.write(json.dumps(entry) + '\n')

	    return manifest_path

	# Lazily loaded ASR model, kept between calls so the checkpoint is only
	# restored once per process.
	_ASR_MODEL = None


	def _get_asr_model():
	    """Return the shared ASR model, loading it on first use."""
	    global _ASR_MODEL
	    if _ASR_MODEL is None:
	        asr_model = EncDecRNNTBPEModel.from_pretrained(
	            model_name="nvidia/stt_en_fastconformer_transducer_large"
	        ).to(device)
	        asr_model.eval()
	        _ASR_MODEL = asr_model
	    return _ASR_MODEL


	def get_transcripts(df, audio_path):
	    """Transcribe every segment listed in ``df``.

	    A temporary manifest describing the segments is written with
	    ``create_manifest``, fed to the ASR model batch by batch, and removed
	    again before returning.

	    Args:
	        df: DataFrame of segments, passed through to ``create_manifest``.
	        audio_path: Path of the audio file the segments refer to.

	    Returns:
	        The list of best hypotheses collected from all batches, in
	        dataloader order.
	    """
	    import os

	    filename = create_manifest(df, audio_path)
	    try:
	        # Named `asr_model` so it does not shadow the module-level diarizer.
	        asr_model = _get_asr_model()
	        config = OmegaConf.create({"manifest_filepath": filename, 'batch_size': 2})
	        dataloader = asr_model._setup_transcribe_dataloader(config)

	        hypotheses = []

	        # Inference only: skip autograd bookkeeping to save memory.
	        with torch.no_grad():
	            for test_batch in dataloader:
	                encoded, encoded_len = asr_model.forward(
	                    input_signal=test_batch[0].to(device),
	                    input_signal_length=test_batch[1].to(device),
	                )
	                best_hyp, _ = asr_model.decoding.rnnt_decoder_predictions_tensor(
	                    encoded,
	                    encoded_len,
	                    return_hypotheses=False,
	                    partial_hypotheses=None,
	                )

	                hypotheses += best_hyp

	                # Drop references to the batch tensors before the next one.
	                del encoded
	                del test_batch

	        return hypotheses
	    finally:
	        # Best-effort cleanup so manifests do not pile up in /tmp.
	        try:
	            os.remove(filename)
	        except OSError:
	            pass

	# HTML footer shown under each interface: links to the MSDD model card,
	# the MSDD paper and the NeMo repository, separated by plain pipes.
	# The separators were previously written as "\|", an invalid escape sequence
	# that also placed a literal backslash in the rendered page.
	article = (
	    "<p style='text-align: center'>"
	    "<a href='https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/diar_msdd_telephonic' target='_blank'>🎙️ Learn more about MSDD model</a> | "
	    "<a href='https://arxiv.org/abs/2203.15974' target='_blank'>📚 MSDD paper</a> | "
	    "<a href='https://github.com/NVIDIA/NeMo' target='_blank'>🧑‍💻 Repository</a>"
	    "</p>"
	)

	# Example inputs offered in the UI; each inner list holds one audio path.
	examples = [
	    ["data/conversation.wav"],
	    ["data/id10270_5r0dWxy17C8-00001.wav"],
	]

	def _build_interface(source, label):
	    """Build one diarization interface for the given audio input source.

	    Both tabs share every setting except where the audio comes from and how
	    the input widget is labelled, so they are built from this one helper.

	    Args:
	        source: Value passed as ``source`` to ``gr.Audio``.
	        label: Label shown on the audio input.

	    Returns:
	        A ``gr.Interface`` that runs ``run_diarization`` on the audio file
	        path and shows the result in a dataframe.
	    """
	    return gr.Interface(
	        fn=run_diarization,
	        inputs=[gr.Audio(source=source, type="filepath", label=label)],
	        outputs=[
	            gr.components.Dataframe(
	                wrap=True,
	                # Label typo fixed: previously read "Diariazation".
	                label='Speaker Diarization with Speech Recognition',
	                row_count=(1, "dynamic"),
	                headers=['start_time', 'end_time', 'speaker', 'text'],
	            )
	        ],
	        title="Offline Speaker Diarization with NeMo",
	        description="This demonstration will perform offline speaker diarization on an audio file using nemo",
	        article=article,
	        layout="vertical",
	        theme="huggingface",
	        allow_flagging=False,
	        live=False,
	        examples=examples,
	    )


	# Records audio from the microphone.
	microphone_interface = _build_interface("microphone", "Mic Audio")

	# Accepts an uploaded audio file.
	upload_interface = _build_interface("upload", "Upload File")

	# Offer both input modes as tabs of one app; the two lists are passed
	# side by side, interfaces first and tab titles second.
	tab_interfaces = [microphone_interface, upload_interface]
	tab_titles = ["Microphone", "Upload File"]
	demo = gr.TabbedInterface(tab_interfaces, tab_titles)

	# Start the app with queuing enabled.
	demo.launch(enable_queue=True)