# app.py — Hugging Face Space (mboushaba), revision 8efadfe (3.37 kB)
# NOTE: web-page chrome from the original capture ("raw / history blame")
# converted to this comment so the module is valid Python.
import gradio as gr
import librosa
import numpy as np
import soundfile as sf
from transformers import pipeline
import os
from huggingface_hub import login
# Retrieve token from environment variable.
# ASR_CEB_HUGGING_FACE_TOKEN must be set (e.g. as a Space secret);
# os.getenv returns None when it is missing, and login() would then fail.
hugging_face_token = os.getenv("ASR_CEB_HUGGING_FACE_TOKEN")
# Login using the token (needed to download the gated/private models below).
login(token=hugging_face_token)
# ASR pipeline presumably specialized for Cebuano (per the model id) — TODO confirm.
asr_ceb = pipeline("automatic-speech-recognition", model = "sil-ai/wav2vec2-bloom-speech-ceb")
# General-purpose Whisper large v3: used below both for plain transcription
# and, with generate_kwargs={"task": "translate"}, for translation.
asr_whisper_large = pipeline("automatic-speech-recognition", model = "openai/whisper-large-v3")
# Fine-tuned Whisper small; the model id suggests a translation model — TODO confirm.
asr_whisper_ceb = pipeline("automatic-speech-recognition",
model = "nlewins/whisper-small-translate-X-gen2-examples-quality-step4-1e-6")
def transcribe_speech(filepath):
    """Run one audio file through all four ASR/translation pipelines.

    Parameters
    ----------
    filepath : str | None
        Path to the recorded/uploaded audio file; ``None`` when Gradio
        received no audio.

    Returns
    -------
    tuple[str, str, str, str] | str
        ``(ceb_transcription, whisper_large_transcription,
        whisper_large_translation, whisper_ceb_output)``, or ``""`` when
        no audio was supplied.
    """
    if filepath is None:
        gr.Warning("No audio found, please retry.")
        return ""
    # Read the sample rate from the file header only — the previous
    # librosa.load(..., sr=None) decoded the whole file just to get the rate.
    sample_rate = sf.info(filepath).samplerate
    model_rate = asr_ceb.feature_extractor.sampling_rate
    if sample_rate != model_rate:
        # Best-effort: returns the original path if resampling fails.
        filepath = resample_audio_for_processing(filepath, model_rate, sample_rate)
    output_ceb = asr_ceb(filepath)
    # "task": "translate" asks Whisper to output English regardless of the
    # (auto-detected) source language.
    generate_kwargs = {
        # "language": "tagalog",  # source language (left unset: auto-detect)
        "task": "translate"
    }
    output_whisper_large_translate = asr_whisper_large(filepath, generate_kwargs = generate_kwargs)
    output_whisper_large = asr_whisper_large(filepath)
    output_whisper_ceb = asr_whisper_ceb(filepath)
    return (output_ceb["text"], output_whisper_large["text"], output_whisper_large_translate["text"],
            output_whisper_ceb["text"])
def resample_audio_for_processing(filepath, model_rate, sample_rate):
    """Resample an audio file to ``model_rate`` and save it to disk.

    Parameters
    ----------
    filepath : str
        Path of the audio file to resample.
    model_rate : int
        Target sampling rate required by the model (Hz).
    sample_rate : int
        Sampling rate the file was reported to have (Hz), for logging.

    Returns
    -------
    str
        Path of the resampled wav file, or the original ``filepath``
        unchanged if resampling fails (best-effort fallback).
    """
    print(f"Audio loaded with rate: {sample_rate} Hz while model requires rate: {model_rate} Hz")
    try:
        print("Resampling audio...")
        # sr=None preserves the file's native rate; librosa returns a NumPy array.
        audio_data, sr = librosa.load(filepath, sr = None)
        # Use the rate we actually loaded (sr) rather than the caller-supplied
        # value — they should match, but sr is authoritative.
        audio_resampled = librosa.resample(np.asarray(audio_data), orig_sr = sr, target_sr = model_rate)
        resampled_audio_path = 'resampled_audio.wav'
        # Bug fix: write at model_rate (was hard-coded 16000), so the saved
        # file's header matches the rate the audio was resampled to.
        sf.write(resampled_audio_path, audio_resampled, model_rate)
        print("Audio resampled successfully.")
        return resampled_audio_path
    except Exception as e:
        # Deliberate best-effort: fall back to the original file on any error.
        print(f"Error resampling audio: {e}, processing with audio as is it !")
        return filepath
# Microphone tab: record audio and feed it through transcribe_speech.
mic_transcribe = gr.Interface(
    fn = transcribe_speech,
    inputs = gr.Audio(sources = ["microphone"], type = "filepath"),
    outputs = [gr.Textbox(label = "Transcription CEB (sil-ai)"),
               gr.Textbox(label = "Transcription (openai)"),
               gr.Textbox(label = "Translation (openai)"),
               # Consistency fix: was "Transcription (nlewins)" — the nlewins
               # model id ("whisper-small-translate-...") is a translation
               # model, and the file-upload tab already labels it so.
               gr.Textbox(label = "Translation (nlewins)")],
    allow_flagging = "never")
# File-upload tab: same pipeline fan-out as the microphone tab, but the
# audio comes from an uploaded file. One textbox per model output.
_file_output_boxes = [
    gr.Textbox(label = "Transcription CEB (sil-ai)"),
    gr.Textbox(label = "Transcription (openai)"),
    gr.Textbox(label = "Translation (openai)"),
    gr.Textbox(label = "Translation (nlewins)"),
]
file_transcribe = gr.Interface(
    fn = transcribe_speech,
    inputs = gr.Audio(sources = ["upload"], type = "filepath"),
    outputs = _file_output_boxes,
    allow_flagging = "never",
)
# Combine both input modes (microphone / file upload) into one tabbed app.
demo = gr.TabbedInterface(
    interface_list = [mic_transcribe, file_transcribe],
    tab_names = ["Use your Microphone", "Upload Audio File"],
)

# Launch the Gradio server only when run as a script.
if __name__ == '__main__':
    demo.launch()