import gradio as gr
from transformers import pipeline, Wav2Vec2ProcessorWithLM
from pyannote.audio import Pipeline
from librosa import load, resample
from rpunct import RestorePuncts

# Wav2Vec2 acoustic model with a 4-gram language model for decoding
asr_model = 'patrickvonplaten/wav2vec2-base-960h-4-gram'
processor = Wav2Vec2ProcessorWithLM.from_pretrained(asr_model)
asr = pipeline(
    'automatic-speech-recognition',
    model=asr_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    decoder=processor.decoder,
)

# Pyannote speaker segmentation (a gated model on the Hub; it may require
# accepting the model's user conditions and passing an auth token)
speaker_segmentation = Pipeline.from_pretrained("pyannote/speaker-segmentation")

# Punctuation/casing restoration for the lower-cased ASR output
rpunct = RestorePuncts()


def transcribe(filepath):
    # Speaker turns (start/end times plus a speaker label) for the whole file
    speaker_output = speaker_segmentation(filepath)

    # Load at the file's native rate, then resample to the 16 kHz expected by Wav2Vec2
    speech, sampling_rate = load(filepath, sr=None)
    if sampling_rate != 16000:
        speech = resample(speech, orig_sr=sampling_rate, target_sr=16000)

    # Word-level timestamps let us align the transcript with the speaker turns
    text = asr(speech, return_timestamps="word")
    full_text = text['text'].lower()
    chunks = text['chunks']

    diarized_output = ""
    i = 0
    for turn, _, speaker in speaker_output.itertracks(yield_label=True):
        # Collect every word whose end time falls inside the current speaker turn
        diarized = ""
        while i < len(chunks) and chunks[i]['timestamp'][1] <= turn.end:
            diarized += chunks[i]['text'].lower() + ' '
            i += 1

        if diarized != "":
            # Restore punctuation for this speaker's segment
            diarized = rpunct.punctuate(diarized)

        diarized_output += "{}: ''{}'' from {:.3f}-{:.3f}\n".format(
            speaker, diarized, turn.start, turn.end)

    return diarized_output, full_text


# Gradio UI (legacy gr.inputs / gr.outputs API)
mic = gr.inputs.Audio(source='microphone', type='filepath',
                      label='Speech input', optional=False)
diarized_transcript = gr.outputs.Textbox(type='auto', label='Diarized Output')
full_transcript = gr.outputs.Textbox(type='auto', label='Full Transcript')
examples = [["meeting_audio.wav"]]

iface = gr.Interface(
    theme='huggingface',
    description='Testing transcription',
    fn=transcribe,
    inputs=[mic],
    outputs=[diarized_transcript, full_transcript],
    examples=examples,
)

iface.launch()