import gradio as gr
import torch
import librosa
import soundfile
import nemo.collections.asr as nemo_asr
import tempfile
import os
import uuid

# Sample rate the Conformer-Transducer checkpoint was trained on.
SAMPLE_RATE = 16000

# Load the pretrained model once at import time; inference-only.
model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained("stt_en_conformer_transducer_large")
model.change_decoding_strategy(None)  # reset to the model's default decoding config
model.eval()


def process_audio_file(file):
    """Load an audio file as a mono waveform resampled to SAMPLE_RATE.

    Parameters
    ----------
    file : str
        Path to the audio file to load.

    Returns
    -------
    numpy.ndarray
        1-D mono float waveform at SAMPLE_RATE.
    """
    # sr=None preserves the file's native sample rate so we resample at
    # most once below.  (The librosa default sr=22050 would silently
    # resample everything to 22.05 kHz first, then again to 16 kHz.)
    data, sr = librosa.load(file, sr=None)
    if sr != SAMPLE_RATE:
        # Keyword arguments are required by librosa >= 0.10 and are
        # accepted by earlier releases as well.
        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
    # monochannel (no-op when librosa.load already returned mono)
    data = librosa.to_mono(data)
    return data


def transcribe(file_mic, file_upload):
    """Transcribe speech from the microphone or an uploaded audio file.

    Prefers the microphone recording when both inputs are given (a
    warning is prepended to the output in that case).  Returns the
    transcription text, or an error message when no input was provided.
    """
    warn_output = ""
    if (file_mic is not None) and (file_upload is not None):
        warn_output = "WARNING: You've uploaded an audio file and used the microphone. " \
                      "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        file = file_mic
    elif (file_mic is None) and (file_upload is None):
        return "ERROR: You have to either use the microphone or upload an audio file"
    elif file_mic is not None:
        file = file_mic
    else:
        file = file_upload

    audio_data = process_audio_file(file)

    # Write the normalized audio to a temporary WAV file so NeMo can
    # transcribe it by path; the directory is cleaned up automatically.
    with tempfile.TemporaryDirectory() as tmpdir:
        audio_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
        soundfile.write(audio_path, audio_data, SAMPLE_RATE)
        transcriptions = model.transcribe([audio_path])

    # RNNT models may return (best_hypotheses, all_hypotheses); keep the best.
    if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
        transcriptions = transcriptions[0]

    return warn_output + transcriptions[0]


iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.inputs.Audio(source="microphone", type='filepath', optional=True),
        gr.inputs.Audio(source="upload", type='filepath', optional=True),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="NeMo Conformer Transducer Large",
    description="Demo for speech recognition using Conformers",
    enable_queue=True,
    allow_flagging=False,
)

iface.launch()