Spaces:
Running
Running
File size: 2,284 Bytes
36f8298 704f9ea 6a0d521 704f9ea 6a0d521 704f9ea 6a0d521 704f9ea 6a0d521 36f8298 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
import nemo.collections.asr as nemo_asr
from pydub import AudioSegment
from io import BytesIO
import tempfile
import os
import gradio as gr
def convert_to_mono(input_file):
    """Return the audio in ``input_file`` as single-channel WAV bytes.

    Args:
        input_file: Source audio, passed straight to
            ``AudioSegment.from_file``.

    Returns:
        A ``BytesIO`` holding the WAV export, rewound to position 0 so the
        caller can read it from the beginning.
    """
    # Decode the input and collapse it to one channel in a single step.
    mono_segment = AudioSegment.from_file(input_file).set_channels(1)

    # Export into memory rather than onto disk.
    wav_buffer = BytesIO()
    mono_segment.export(wav_buffer, format="wav")

    # Exporting leaves the cursor at the end of the data; rewind it.
    wav_buffer.seek(0)
    return wav_buffer
# Identifier of the pretrained checkpoint used for transcription.
_PRETRAINED_MODEL_NAME = "nvidia/stt_ka_fastconformer_hybrid_large_pc"

# Instantiate the model once at import time; transcribe_audio reuses this
# module-level instance on every call.
asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
    model_name=_PRETRAINED_MODEL_NAME
)
def transcribe_audio(audio_file):
    """Transcribe an uploaded audio file with the module-level ASR model.

    The input is converted to mono WAV with ``convert_to_mono``, written to a
    temporary ``.wav`` file whose path is handed to ``asr_model.transcribe``,
    and the temporary file is removed afterwards, including when writing or
    transcription fails.

    Args:
        audio_file: Path of the uploaded audio file, or a falsy value
            (``None`` or an empty string) when nothing was provided.

    Returns:
        ``res[0][0]`` of the model output, or ``None`` when ``audio_file``
        is falsy.
    """
    if not audio_file:
        # Nothing was uploaded; there is nothing to transcribe.
        return None

    mono_audio = convert_to_mono(audio_file)

    temp_file_path = None
    try:
        # delete=False keeps the file on disk after the ``with`` block closes
        # it, so the model can reopen it by path.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
            temp_file_path = temp_file.name
            temp_file.write(mono_audio.read())

        res = asr_model.transcribe([temp_file_path])
    finally:
        # Clean up even if writing or transcription raised, so failed
        # requests do not leave temporary files behind.
        if temp_file_path is not None and os.path.exists(temp_file_path):
            os.remove(temp_file_path)

    # NOTE(review): indexing [0][0] assumes transcribe() returns a nested
    # result whose first element is the list of transcriptions. Confirm
    # against the installed NeMo version.
    return res[0][0]
# def transcribe_audio(audio_file):
# if audio_file:
# # Convert the uploaded audio to mono
# mono_audio = convert_to_mono(audio_file)
# # Transcribe the audio using the BytesIO object directly
# audio_data = mono_audio.read()
# # Use the audio_data in the format expected by the ASR model
# res = asr_model.transcribe([BytesIO(audio_data)])
# # Return the transcription result
# return res[0][0]
# Audio input component: type="filepath" means the handler receives a path
# to the uploaded file.
audio_input = gr.Audio(type="filepath")

# Wire the upload widget to the transcription handler and show the result
# as plain text.
interface = gr.Interface(
    fn=transcribe_audio,
    inputs=audio_input,
    outputs="text",
    title="ASR Transcription",
    description="Upload an audio file (mp3, wav, or m4a) and get the transcription.",
)

# Start serving the app.
interface.launch()
|