Spaces:

Baghdad99
/

ha-en

Sleeping

App Files Files Community

ha-en / app.py

Baghdad99

Update app.py

ea3653e 11 months ago

raw

history blame

3.7 kB

	import gradio as gr
	from transformers import pipeline, AutoTokenizer
	import numpy as np
	from pydub import AudioSegment
	import librosa

	# Three model pipelines, built once when the module is imported and reused
	# by every request: speech recognition, text translation, speech synthesis.

	# Stage 1: transcribe spoken Hausa to text.
	pipe = pipeline(
	    task="automatic-speech-recognition",
	    model="Akashpb13/Hausa_xlsr",
	    tokenizer="Akashpb13/Hausa_xlsr",
	)

	# Stage 2: translate the Hausa transcription into English text.
	translator = pipeline(
	    task="text2text-generation",
	    model="Baghdad99/saad-hausa-text-to-english-text",
	)

	# Stage 3: synthesise English speech from the translated text.
	tts = pipeline(
	    task="text-to-speech",
	    model="Baghdad99/english_voice_tts",
	)

	def _load_mono_float(audio_input):
	    """Return ``(sample_rate, samples)`` with samples as 1-D float64.

	    ``audio_input`` is either a ``(sample_rate, ndarray)`` tuple or something
	    ``librosa.load`` can open. Integer PCM is scaled by its dtype's maximum so
	    it lands in [-1, 1]; floating-point data is only cast, never rescaled.
	    """
	    if isinstance(audio_input, tuple):
	        sample_rate, samples = audio_input
	        samples = np.asarray(samples)
	    else:
	        # NOTE(review): some gradio versions hand over a temp-file wrapper
	        # rather than a path string; fall back to its ``name`` when present.
	        # Confirm against the deployed gradio version.
	        path = getattr(audio_input, "name", audio_input)
	        samples, sample_rate = librosa.load(path, sr=None)

	    if np.issubdtype(samples.dtype, np.integer):
	        # np.iinfo only accepts integer dtypes, so it must not be called for
	        # the float arrays librosa returns.
	        full_scale = np.iinfo(samples.dtype).max
	        samples = samples.astype(np.float64) / full_scale
	    else:
	        samples = samples.astype(np.float64)

	    if samples.ndim > 1:
	        # assumes a (samples, channels) layout for multi-channel recordings
	        # -- TODO confirm against the gradio version in use.
	        samples = samples.mean(axis=1)

	    return sample_rate, samples


	def translate_speech(audio_input):
	    """Translate spoken Hausa audio into synthesised English speech.

	    The audio is transcribed with ``pipe``, the transcription is translated
	    with ``translator`` and the result is voiced with ``tts``.

	    Args:
	        audio_input: ``None``, a ``(sample_rate, ndarray)`` tuple, or a
	            path / file object readable by ``librosa.load``.

	    Returns:
	        ``(sampling_rate, samples)`` where ``samples`` is a 1-D ``int16``
	        array, or ``None`` when there is no usable audio or any stage does
	        not produce the expected output.
	    """
	    print(f"Type of audio: {type(audio_input)}, Value of audio: {audio_input}")  # Debug line

	    # Nothing was recorded or uploaded.
	    if audio_input is None:
	        print("No audio input was provided")
	        return None

	    sample_rate, samples = _load_mono_float(audio_input)
	    if samples.size == 0:
	        print("The audio input is empty")
	        return None

	    # A bare array carries no rate information, so bring the signal to the
	    # rate the recogniser's feature extractor is configured for.
	    target_rate = pipe.feature_extractor.sampling_rate
	    if sample_rate != target_rate:
	        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=target_rate)

	    # Use the speech recognition pipeline to transcribe the audio
	    output = pipe(samples)
	    print(f"Output: {output}")  # Print the output to see what it contains

	    if 'text' not in output:
	        print("The output does not contain 'text'")
	        return None
	    transcription = output["text"]
	    print(f"Transcription: {transcription}")
	    if not transcription.strip():
	        print("The transcription is empty")
	        return None

	    # Ask for token ids so the decoding step below controls special tokens.
	    translated_text = translator(transcription, return_tensors=True)
	    print(f"Translated text: {translated_text}")  # Print the translated text to see what it contains

	    if not translated_text or 'generated_token_ids' not in translated_text[0]:
	        print("The translated text does not contain 'generated_token_ids'")
	        return None

	    # Drop padding / end-of-sequence markers so they are not spoken aloud.
	    translated_text_str = translator.tokenizer.decode(
	        translated_text[0]['generated_token_ids'],
	        skip_special_tokens=True,
	    )
	    print(f"Translated text string: {translated_text_str}")
	    if not translated_text_str.strip():
	        print("The translated text is empty")
	        return None

	    # Use the text-to-speech pipeline to synthesize the translated text
	    synthesised_speech = tts(translated_text_str)
	    print(f"Synthesised speech: {synthesised_speech}")  # Print the synthesised speech to see what it contains

	    if 'audio' not in synthesised_speech:
	        print("The synthesised speech does not contain 'audio'")
	        return None

	    synthesised_speech_data = np.asarray(synthesised_speech['audio']).flatten()
	    print(f"Synthesised speech data type: {type(synthesised_speech_data)}, Synthesised speech data shape: {synthesised_speech_data.shape}")

	    # Clip before scaling: values outside [-1, 1] would otherwise wrap around
	    # when cast to int16.
	    pcm = (np.clip(synthesised_speech_data, -1.0, 1.0) * 32767).astype(np.int16)

	    # Prefer the rate reported by the synthesiser; 16000 is the previous
	    # hard-coded value, kept as the fallback.
	    output_rate = synthesised_speech.get('sampling_rate', 16000)
	    return output_rate, pcm

	# Web front end: audio captured from the microphone goes to
	# translate_speech, and the value it returns is rendered by a numpy-typed
	# audio output component.
	_microphone_input = gr.inputs.Audio(source="microphone", type="file")
	_speech_output = gr.outputs.Audio(type="numpy")

	iface = gr.Interface(
	    fn=translate_speech,
	    inputs=_microphone_input,
	    outputs=_speech_output,
	    title="Hausa to English Translation",
	    description="Realtime demo for Hausa to English translation using speech recognition and text-to-speech synthesis.",
	)

	# Start serving the interface.
	iface.launch()