|
import gradio as gr |
|
import librosa |
|
import numpy as np |
|
import soundfile as sf |
|
from transformers import pipeline |
|
import os |
|
from huggingface_hub import login |
|
|
|
|
|
# Hugging Face Hub authentication, needed to download gated/private models.
# The token is read from the environment; guard the login call because
# login(token=None) would fall back to an interactive prompt and stall a
# headless deployment. Public models still load without a token.
hugging_face_token = os.getenv("ASR_CEB_HUGGING_FACE_TOKEN")

if hugging_face_token:
    login(token=hugging_face_token)
|
|
|
# Cebuano ASR model (wav2vec2); its feature extractor's sampling_rate is the
# reference rate used by transcribe_speech() to decide whether to resample.
asr_ceb = pipeline("automatic-speech-recognition", model = "sil-ai/wav2vec2-bloom-speech-ceb")

# General-purpose multilingual Whisper model, used twice below: once for plain
# transcription and once with generate_kwargs={"task": "translate"}.
asr_whisper_large = pipeline("automatic-speech-recognition", model = "openai/whisper-large-v3")

# Fine-tuned Whisper checkpoint — name suggests Cebuano-to-English translation;
# TODO confirm the checkpoint's actual task/output language.
asr_whisper_ceb = pipeline("automatic-speech-recognition",

                           model = "nlewins/whisper-small-translate-X-gen2-examples-quality-step4-1e-6")
|
|
|
|
|
def transcribe_speech(filepath):
    """Run an audio file through all four speech pipelines.

    Parameters
    ----------
    filepath : str | None
        Path to the recorded/uploaded audio file. Gradio passes None when
        the user submits without providing audio.

    Returns
    -------
    tuple[str, str, str, str]
        (Cebuano transcription, Whisper transcription, Whisper English
        translation, fine-tuned Whisper output). Empty strings when no
        audio was supplied.
    """
    if filepath is None:
        gr.Warning("No audio found, please retry.")
        # BUG FIX: the Gradio interfaces declare four output textboxes, so the
        # no-audio path must return four values, not a single "".
        return "", "", "", ""

    # Read the sample rate from the file header instead of decoding the whole
    # signal (the original called librosa.load just to obtain the rate).
    sample_rate = librosa.get_samplerate(filepath)
    model_rate = asr_ceb.feature_extractor.sampling_rate
    if sample_rate != model_rate:
        # Rewrite the audio at the model's expected rate; falls back to the
        # original file on failure.
        filepath = resample_audio_for_processing(filepath, model_rate, sample_rate)

    output_ceb = asr_ceb(filepath)

    # Ask the multilingual Whisper model for an English translation.
    generate_kwargs = {"task": "translate"}
    output_whisper_large_translate = asr_whisper_large(filepath, generate_kwargs = generate_kwargs)
    output_whisper_large = asr_whisper_large(filepath)
    output_whisper_ceb = asr_whisper_ceb(filepath)

    return (output_ceb["text"], output_whisper_large["text"], output_whisper_large_translate["text"],
            output_whisper_ceb["text"])
|
|
|
|
|
def resample_audio_for_processing(filepath, model_rate, sample_rate):
    """Resample an audio file to the model's expected rate.

    Parameters
    ----------
    filepath : str
        Path to the source audio file.
    model_rate : int
        Target sampling rate required by the model (Hz).
    sample_rate : int
        The file's current sampling rate (Hz).

    Returns
    -------
    str
        Path to the resampled WAV file, or the original ``filepath`` if
        resampling fails (best-effort: the caller processes the audio as-is).
    """
    print(f"Audio loaded with rate: {sample_rate} Hz while model requires rate: {model_rate} Hz")
    try:
        print("Resampling audio...")
        # sr=None preserves the native rate; librosa already returns a
        # float ndarray, so no extra np.array copy is needed.
        audio_data, _ = librosa.load(filepath, sr = None)

        audio_resampled = librosa.resample(audio_data, orig_sr = sample_rate, target_sr = model_rate)

        resampled_audio_path = 'resampled_audio.wav'
        # BUG FIX: write with the actual target rate instead of a hardcoded
        # 16000 — they only coincide when model_rate happens to be 16 kHz.
        sf.write(resampled_audio_path, audio_resampled, model_rate)
        print("Audio resampled successfully.")
        return resampled_audio_path
    except Exception as e:
        # Best-effort fallback: report and let the caller use the original file.
        print(f"Error resampling audio: {e}, processing with audio as is it !")
        return filepath
|
|
|
|
|
# Tab 1: live microphone input. type="filepath" makes Gradio hand
# transcribe_speech a path to a temporary file rather than raw samples.
mic_transcribe = gr.Interface(
    fn = transcribe_speech,
    inputs = gr.Audio(sources = ["microphone"], type = "filepath"),
    outputs = [gr.Textbox(label = "Transcription CEB (sil-ai)"), gr.Textbox(label = "Transcription (openai)"),
               gr.Textbox(label = "Translation (openai)"),
               # BUG FIX: the nlewins model is a translation checkpoint — label it
               # "Translation" to match the file-upload tab.
               gr.Textbox(label = "Translation (nlewins)")]
    , allow_flagging = "never")
|
|
|
# Tab 2: uploaded audio files — same pipeline function as the microphone tab,
# one textbox per model output.
_file_output_boxes = [
    gr.Textbox(label = "Transcription CEB (sil-ai)"),
    gr.Textbox(label = "Transcription (openai)"),
    gr.Textbox(label = "Translation (openai)"),
    gr.Textbox(label = "Translation (nlewins)"),
]

file_transcribe = gr.Interface(
    fn = transcribe_speech,
    inputs = gr.Audio(sources = ["upload"], type = "filepath"),
    outputs = _file_output_boxes,
    allow_flagging = "never",
)
|
|
|
# Combine the two interfaces into a single tabbed application.
demo = gr.TabbedInterface(
    interface_list = [mic_transcribe, file_transcribe],
    tab_names = ["Use your Microphone", "Upload Audio File"],
)
|
|
|
# Start the Gradio server only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()
|
|