speech-to-speech-translation-test

Sleeping

App Files Files Community

speech-to-speech-translation-test / app.py

juangtzi

Update app.py

b7cd514 verified 4 months ago

raw

history blame

2.86 kB

	import gradio as gr
	import numpy as np
	import torch
	from transformers import pipeline, VitsModel, AutoTokenizer, AutoTokenizer
	from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

	# Run on the first CUDA GPU when torch can see one; otherwise fall back to CPU.
	if torch.cuda.is_available():
	    device = "cuda:0"
	else:
	    device = "cpu"

	# Whisper pipeline used for the speech-recognition step.
	asr_pipe = pipeline(
	    "automatic-speech-recognition",
	    model="openai/whisper-large-v2",
	    device=device,
	)

	# Disabled alternative TTS backend (MMS VITS, Spanish), kept for reference:
	#vist_model = VitsModel.from_pretrained("facebook/mms-tts-spa")
	#vist_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-spa")

	# SpeechT5 text-to-speech stack: acoustic model from the
	# "juangtzi/speecht5_finetuned_voxpopuli_es" checkpoint, processor from the
	# base "microsoft/speecht5_tts" checkpoint, plus the HiFi-GAN vocoder.
	# NOTE(review): none of these are moved to `device`, so they stay wherever
	# from_pretrained places them (presumably CPU) - confirm this is intended.
	checkpoint = "microsoft/speecht5_tts"
	model = SpeechT5ForTextToSpeech.from_pretrained("juangtzi/speecht5_finetuned_voxpopuli_es")
	processor = SpeechT5Processor.from_pretrained(checkpoint)
	vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

	# Speaker embedding read from a local .npy file and converted to a tensor;
	# it is passed to model.generate_speech on every synthesis call.
	speaker_embeddings2 = torch.tensor(np.load("speaker_embeddings.npy"))
	print(speaker_embeddings2)

	# Text-classification pipeline used by language_detector().
	lang_detector = pipeline(
	    "text-classification",
	    model="papluca/xlm-roberta-base-language-detection",
	)

	def language_detector(text):
	    """Classify ``text`` with ``lang_detector`` and return the first result's label.

	    The label is also printed to stdout before it is returned.
	    """
	    detected_label = lang_detector(text)[0]['label']
	    print(detected_label)
	    return detected_label

	def translate(audio):
	    """Run the Whisper ASR pipeline on ``audio`` and return the recognised text.

	    Despite the name, the pipeline is invoked with task "transcribe" and the
	    decoder language forced to "es"; the text therefore comes back as whatever
	    Whisper produces under that setting (presumably Spanish - confirm).

	    Args:
	        audio: Input accepted by ``asr_pipe``. The Gradio interfaces in this
	            file use ``type="filepath"``, so from the UI this is a file path.

	    Returns:
	        The ``"text"`` field of the pipeline output.
	    """
	    # max_new_tokens is passed inside generate_kwargs: newer transformers
	    # releases deprecate it as a top-level pipeline call argument, and both
	    # forms end up as the same argument to model.generate().
	    generation_options = {
	        "task": "transcribe",
	        "language": "es",
	        "max_new_tokens": 256,
	    }
	    outputs = asr_pipe(audio, generate_kwargs=generation_options)
	    return outputs["text"]

	def synthesise(text):
	    """Generate speech for ``text`` with the module-level SpeechT5 model.

	    The text is tokenised by ``processor``; the resulting input ids, the
	    ``speaker_embeddings2`` tensor and the HiFi-GAN ``vocoder`` are handed to
	    ``model.generate_speech``, whose result is returned unchanged.
	    """
	    token_ids = processor(text=text, return_tensors="pt")["input_ids"]
	    return model.generate_speech(token_ids, speaker_embeddings2, vocoder=vocoder)

	def speech_to_speech_translation(audio):
	    """Turn input speech into synthesised speech via text.

	    Pipeline: ``translate`` (speech -> text) followed by ``synthesise``
	    (text -> waveform). The waveform is moved to the CPU, converted to a
	    NumPy array, squeezed, and peak-normalised.

	    Args:
	        audio: Passed straight through to ``translate``.

	    Returns:
	        A ``(sample_rate, audio_data)`` tuple, the format used by the
	        ``gr.Audio(type="numpy")`` outputs in this file. ``sample_rate`` is
	        fixed at 16000 (assumed to match the vocoder's output rate - confirm).
	    """
	    translated_text = translate(audio)
	    synthesised_speech = synthesise(translated_text)
	    audio_data = np.squeeze(synthesised_speech.cpu().numpy())

	    # Peak-normalise, but only when there is a non-zero peak: an all-zero
	    # waveform would otherwise become NaN (0 / 0), and np.max raises
	    # ValueError on an empty array.
	    peak = np.max(np.abs(audio_data)) if audio_data.size else 0.0
	    if peak > 0:
	        audio_data = audio_data / peak

	    sample_rate = 16000
	    return (sample_rate, audio_data)

	# Heading and markdown description shown on both interface tabs.
	title = "Cascaded STST"
	description = """
	Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Spanish.

	![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
	"""


	def _build_interface(source, **extra):
	    """Create one gr.Interface around speech_to_speech_translation.

	    ``source`` is forwarded as the ``sources`` argument of the input
	    ``gr.Audio``; ``extra`` carries any additional ``gr.Interface`` keyword
	    arguments. Each call creates its own input and output components.
	    """
	    return gr.Interface(
	        fn=speech_to_speech_translation,
	        inputs=gr.Audio(sources=source, type="filepath"),
	        outputs=gr.Audio(label="Generated Speech", type="numpy"),
	        title=title,
	        description=description,
	        **extra,
	    )


	demo = gr.Blocks()

	# One tab records from the microphone, the other takes an uploaded file
	# and ships with a bundled example clip.
	mic_translate = _build_interface("microphone")
	file_translate = _build_interface("upload", examples=[["./example.wav"]])

	with demo:
	    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

	demo.launch()