Spaces:

revaza
/

speech-2-text-ka

Running

App Files Files Community

speech-2-text-ka / app.py

revaza

Update app.py

6a0d521 verified 6 months ago

raw

history blame contribute delete

2.28 kB

	import nemo.collections.asr as nemo_asr
	from pydub import AudioSegment
	from io import BytesIO
	import tempfile
	import os
	import gradio as gr

	def convert_to_mono(input_file):
	    """Return the audio in *input_file* as single-channel WAV data held in memory.

	    Args:
	        input_file: Audio source handed straight to ``AudioSegment.from_file``.

	    Returns:
	        A ``BytesIO`` containing the WAV export, rewound to position 0 so the
	        caller can read it from the beginning.
	    """
	    # Decode the source and downmix to one channel in a single expression.
	    mono_segment = AudioSegment.from_file(input_file).set_channels(1)

	    # Export into an in-memory buffer instead of a file on disk.
	    wav_buffer = BytesIO()
	    mono_segment.export(wav_buffer, format="wav")

	    # The export leaves the cursor at the end of the data; rewind before returning.
	    wav_buffer.seek(0)
	    return wav_buffer


	# Load the pre-trained ASR model once, as a module-level side effect, so that
	# every call to transcribe_audio reuses the same instance.
	# NOTE(review): the "ka" in the checkpoint name presumably denotes Georgian — confirm.
	asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
	model_name="nvidia/stt_ka_fastconformer_hybrid_large_pc"
	)


	def transcribe_audio(audio_file):
	    """Transcribe an uploaded audio file with the module-level ASR model.

	    The input is converted to mono WAV via ``convert_to_mono`` and written to
	    a temporary ``.wav`` file, whose path is handed to ``asr_model``. The
	    temporary file is removed whether or not transcription succeeds.

	    Args:
	        audio_file: Path of the uploaded audio file, or a falsy value
	            (``None`` / empty string) when nothing was uploaded.

	    Returns:
	        ``res[0][0]`` of the model's ``transcribe`` result, or ``None`` when
	        no audio file was supplied.
	    """
	    if not audio_file:
	        return None

	    # Convert first, so a decoding failure never leaves a temp file behind.
	    mono_audio = convert_to_mono(audio_file)

	    # delete=False keeps the file on disk after it is closed, so the model can
	    # open it by path; removing it is therefore this function's job.
	    temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
	    temp_file_path = temp_file.name
	    try:
	        # Close the file before transcribing so the data is fully flushed.
	        with temp_file:
	            temp_file.write(mono_audio.read())

	        res = asr_model.transcribe([temp_file_path])
	    finally:
	        # Clean up even when writing or transcription raises.
	        os.remove(temp_file_path)

	    # NOTE(review): assumes transcribe() returns a nested sequence whose
	    # [0][0] element is the text for the single input file — confirm against
	    # the installed NeMo version.
	    return res[0][0]
	# def transcribe_audio(audio_file):
	# if audio_file:
	# # Convert the uploaded audio to mono
	# mono_audio = convert_to_mono(audio_file)

	# # Transcribe the audio using the BytesIO object directly
	# audio_data = mono_audio.read()

	# # Use the audio_data in the format expected by the ASR model
	# res = asr_model.transcribe([BytesIO(audio_data)])

	# # Return the transcription result
	# return res[0][0]


	# Create the Gradio interface: transcribe_audio is the handler for the audio
	# input, and whatever it returns is shown in the text output.
	interface = gr.Interface(
	fn=transcribe_audio,
	inputs=gr.Audio(type="filepath"), # Allow audio file uploads and get the file path
	outputs="text", # Display the transcription result as text
	title="ASR Transcription",
	description="Upload an audio file (mp3, wav, or m4a) and get the transcription."
	)

	# Launch the Gradio interface. This runs whenever the module is executed or
	# imported, since there is no __main__ guard.
	interface.launch()