Spaces:
Running
Running
import nemo.collections.asr as nemo_asr | |
from pydub import AudioSegment | |
from io import BytesIO | |
import tempfile | |
import os | |
import gradio as gr | |
def convert_to_mono(input_file):
    """Return the audio in *input_file* as single-channel WAV data held in memory.

    Args:
        input_file: Audio source handed straight to ``AudioSegment.from_file``.

    Returns:
        A ``BytesIO`` containing the WAV export, positioned at offset 0.
    """
    # Decode the source and collapse it to one channel in a single expression.
    mono_track = AudioSegment.from_file(input_file).set_channels(1)

    # Export as WAV into an in-memory buffer rather than onto disk.
    wav_buffer = BytesIO()
    mono_track.export(wav_buffer, format="wav")

    # Rewind so the caller's first read() starts at the beginning of the data.
    wav_buffer.seek(0)
    return wav_buffer
# Instantiate the pretrained hybrid RNNT/CTC checkpoint at module level so that
# transcribe_audio() uses this one shared model object on every call.
asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
    model_name="nvidia/stt_ka_fastconformer_hybrid_large_pc",
)
def transcribe_audio(audio_file):
    """Transcribe an uploaded audio file with the module-level ASR model.

    The audio is converted to mono WAV, written to a temporary file (the model
    is given a file path), transcribed, and the temporary file is removed
    afterwards, even when writing or transcription fails.

    Args:
        audio_file: Path of the uploaded audio, or a falsy value (``None`` or
            ``""``) when nothing was provided.

    Returns:
        The value at ``res[0][0]`` of the model's transcription result, or
        ``None`` when no audio was provided.
    """
    if not audio_file:
        return None

    # Convert the uploaded audio to mono.
    mono_audio = convert_to_mono(audio_file)

    # delete=False keeps the file on disk after it is closed so the model can
    # open it by path; that makes removing it our responsibility.
    temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    temp_file_path = temp_file.name
    try:
        # Close the file before transcribing so the data is fully flushed.
        with temp_file:
            temp_file.write(mono_audio.read())
        res = asr_model.transcribe([temp_file_path])
    finally:
        # Runs on success and on failure, so the file never leaks on disk.
        os.remove(temp_file_path)

    # NOTE(review): res[0][0] assumes transcribe() returns a nested sequence
    # whose first element holds the texts; confirm against the installed
    # NeMo version.
    return res[0][0]
# Disabled alternative, kept for reference: pass the in-memory WAV data to the
# model directly instead of going through a temporary file.
# NOTE(review): it is unclear whether asr_model.transcribe accepts BytesIO
# objects; confirm before re-enabling.
# def transcribe_audio(audio_file):
#     if audio_file:
#         # Convert the uploaded audio to mono
#         mono_audio = convert_to_mono(audio_file)
#         # Transcribe the audio using the BytesIO object directly
#         audio_data = mono_audio.read()
#         # Use the audio_data in the format expected by the ASR model
#         res = asr_model.transcribe([BytesIO(audio_data)])
#         # Return the transcription result
#         return res[0][0]
# Build the web UI: one audio input wired to transcribe_audio, one text output.
interface = gr.Interface(
    title="ASR Transcription",
    description="Upload an audio file (mp3, wav, or m4a) and get the transcription.",
    fn=transcribe_audio,
    # type="filepath" makes the callback receive a path to the audio file.
    inputs=gr.Audio(type="filepath"),
    # The transcription is displayed as plain text.
    outputs="text",
)

# Start serving the interface.
interface.launch()