speech-to-speech-translation

Sleeping

App Files Files Community

speech-to-speech-translation / app.py

preetam8

Fix commenting and gradio version

d386a01 3 months ago

raw

history blame

3.1 kB

	import gradio as gr
	import logging
	import numpy as np
	import torch

	from transformers import VitsModel, VitsTokenizer, pipeline
	from transformers import M2M100ForConditionalGeneration
	from tokenization_small100 import SMALL100Tokenizer


	# Prefer the first CUDA GPU when one is visible; otherwise run on CPU.
	device = "cuda:0" if torch.cuda.is_available() else "cpu"

	# Language code handed to the translation tokenizer as its target language.
	target_language = "fr"

	# Speech-to-text stage, followed by the text-to-text translation model
	# and its tokenizer.
	asr_pipe = pipeline(
	    "automatic-speech-recognition",
	    model="bofenghuang/whisper-small-cv11-french",
	    device=device,
	)
	translation_model = M2M100ForConditionalGeneration.from_pretrained(
	    "alirezamsh/small100"
	)
	translation_tokenizer = SMALL100Tokenizer.from_pretrained(
	    "alirezamsh/small100",
	    tgt_lang=target_language,
	)

	# Text-to-speech stage: VITS model and the tokenizer from the same checkpoint.
	model = VitsModel.from_pretrained("facebook/mms-tts-fra")
	tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-fra")


	def translate(audio):
	    """Convert input speech into text in the configured target language.

	    The audio is run through ``asr_pipe`` with the ``translate`` generation
	    task, and the recognised text is then passed through
	    ``translation_model`` using ``translation_tokenizer``.

	    Args:
	        audio: Audio input in a form ``asr_pipe`` accepts; the Gradio
	            interfaces in this file supply a file path.

	    Returns:
	        The result of ``translation_tokenizer.batch_decode`` with special
	        tokens skipped (presumably a list holding one string per generated
	        sequence -- confirm against the tokenizer).
	    """
	    asr_result = asr_pipe(
	        audio,
	        max_new_tokens=256,
	        generate_kwargs={"task": "translate"},
	    )
	    model_inputs = translation_tokenizer(asr_result["text"], return_tensors="pt")
	    token_ids = translation_model.generate(**model_inputs)
	    translated_text = translation_tokenizer.batch_decode(
	        token_ids, skip_special_tokens=True
	    )
	    logging.info(f"Translated Text: {translated_text}")
	    return translated_text


	def synthesise(text):
	    """Generate a speech waveform for ``text`` with the text-to-speech model.

	    Args:
	        text: Text to speak, in a form accepted by ``tokenizer``.

	    Returns:
	        The first waveform of the model output, moved to the CPU.
	    """
	    token_batch = tokenizer(text, return_tensors="pt")
	    # Inference only, so gradient tracking is switched off.
	    with torch.no_grad():
	        tts_output = model(token_batch["input_ids"])
	    waveform = tts_output["waveform"][0]
	    logging.info(waveform)
	    return waveform.cpu()


	def speech_to_speech_translation(audio):
	    """Translate spoken audio and return the synthesised speech for Gradio.

	    Args:
	        audio: Audio input, forwarded unchanged to ``translate``.

	    Returns:
	        A ``(sampling_rate, samples)`` tuple in which ``samples`` is a NumPy
	        ``int16`` array, as consumed by a ``type="numpy"`` audio output.
	    """
	    translated_text = translate(audio)
	    waveform = synthesise(translated_text).numpy()
	    # Scale the float waveform to the int16 range
	    # (assumes samples lie within [-1, 1] -- TODO confirm).
	    pcm_samples = (waveform * 32767).astype(np.int16)
	    # Read the rate from the loaded TTS checkpoint rather than hard-coding
	    # 16000, so playback speed stays correct if the checkpoint is changed.
	    return model.config.sampling_rate, pcm_samples


	# Heading and Markdown blurb shown on each interface tab. The model names
	# and links are meant to match the checkpoints this file actually loads.
	title = "Cascaded STST"
	description = """
	Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in French. Demo uses the [bofenghuang/whisper-small-cv11-french](https://huggingface.co/bofenghuang/whisper-small-cv11-french) Whisper checkpoint for ASR, the
	[SMaLL-100](https://huggingface.co/alirezamsh/small100) model for text-to-text translation and Facebook's [MMS TTS-FRA](https://huggingface.co/facebook/mms-tts-fra) model for French text-to-speech:

	![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
	"""

	def _build_interface(source, examples=None):
	    """Create one translation Interface whose audio input comes from ``source``."""
	    return gr.Interface(
	        fn=speech_to_speech_translation,
	        inputs=gr.Audio(source=source, type="filepath"),
	        outputs=gr.Audio(label="Generated Speech", type="numpy"),
	        examples=examples,
	        title=title,
	        description=description,
	    )


	# Container that hosts the tabbed UI.
	demo = gr.Blocks()

	# One interface per input method; both call the same translation function.
	mic_translate = _build_interface("microphone")
	file_translate = _build_interface("upload", examples=[["./example.wav"]])

	with demo:
	    gr.TabbedInterface(
	        [mic_translate, file_translate],
	        ["Microphone", "Audio File"],
	    )

	# Set the root logger to INFO so the logging.info calls above are emitted.
	logging.getLogger().setLevel(logging.INFO)
	demo.launch()