speech-to-speech-translation

Runtime error

App Files Files Community

speech-to-speech-translation / app.py

davidggphy

Adapt to Whisper (es) + Bark (es)

245bced over 1 year ago

raw

history blame

3.08 kB

	import gradio as gr
	import numpy as np
	import torch
	from transformers import BarkModel
	from transformers import AutoProcessor
	from transformers import pipeline
	import librosa

	# Pick the accelerator first so the model can be placed on it right after loading.
	device = "cuda:0" if torch.cuda.is_available() else "cpu"

	# Bark (small) text-to-speech checkpoint and the processor that prepares its inputs.
	processor = AutoProcessor.from_pretrained("suno/bark-small")
	model = BarkModel.from_pretrained("suno/bark-small").to(device)


	# Voice-preset prefixes keyed by language code. Speaker library:
	# https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c
	language_presets = dict(es="v2/es_speaker_", en="v2/en_speaker_")
	def tts(text, language="es", style:int = 0):
	    """Synthesise ``text`` with the Bark model.

	    The voice preset is the ``language_presets`` prefix for ``language``
	    followed by ``style`` (for the defaults: ``v2/es_speaker_0``).

	    Returns a ``(waveform, sampling_rate)`` tuple: the first generated
	    waveform as a NumPy array on the CPU, and the sample rate taken from the
	    model's generation config.
	    """
	    preset_name = f"{language_presets[language]}{style!s}"
	    # Build the model inputs for the text and speaker prompt, then move them to the model's device.
	    model_inputs = processor(text, voice_preset=preset_name).to(device)
	    generated = model.generate(**model_inputs)
	    waveform = generated[0].cpu().numpy()
	    return waveform, model.generation_config.sample_rate


	# Speech-recognition pipeline backed by the Whisper (base) checkpoint.
	asr_pipe = pipeline(
	    task="automatic-speech-recognition",
	    model="openai/whisper-base",
	    device=device,
	)


	def translate(audio, language:str = "es"):
	    """Run the Whisper pipeline on ``audio`` and return the resulting text.

	    The pipeline is called with ``task="transcribe"`` and the given
	    ``language`` code, and generation is capped at 256 new tokens.
	    """
	    # NOTE(review): the task is "transcribe" even though this function is named
	    # ``translate`` - presumably the language setting is what steers the output
	    # language; confirm this is intended.
	    whisper_options = {"task": "transcribe", "language": language}
	    result = asr_pipe(audio, max_new_tokens=256, generate_kwargs=whisper_options)
	    return result["text"]


	def synthesise(text, language="es",style=0):
	    """Generate speech for ``text`` via ``tts`` and resample it to 16 kHz.

	    Returns ``(waveform, 16000)``.
	    """
	    output_rate = 16_000
	    waveform, native_rate = tts(text, language=language, style=style)
	    resampled = librosa.resample(waveform, orig_sr=native_rate, target_sr=output_rate)
	    return resampled, output_rate


	def speech_to_speech_translation(audio, debug = True):
	    """Convert input speech to synthesised speech by going through text.

	    Args:
	        audio: Audio input forwarded to ``translate`` (the Gradio components
	            in this file are configured with ``type="filepath"``).
	        debug: When true, print the intermediate text to stdout.

	    Returns:
	        ``(sampling_rate, samples)`` where ``samples`` is an ``int16`` NumPy
	        array.
	    """
	    translated_text = translate(audio)
	    if debug:
	        print(f"{translated_text=}")
	    synthesised_speech, sampling_rate = synthesise(translated_text)
	    # Transform to 16-bit integers for Gradio. Clip to [-1, 1] first: any
	    # sample outside that range (e.g. overshoot introduced by resampling)
	    # would overflow int16 once scaled by 32767.
	    waveform = np.clip(np.asarray(synthesised_speech), -1.0, 1.0)
	    synthesised_speech = (waveform * 32767).astype(np.int16)
	    return sampling_rate, synthesised_speech


	# User-facing heading and Markdown blurb shown on each Gradio interface.
	# The blurb names the models this file actually loads (Whisper Base for
	# speech-to-text, Bark small for text-to-speech) and the default target
	# language used by the pipeline ("es").
	title = "Cascaded STST"
	description = """
	Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Spanish. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Suno's
	[Bark (small)](https://huggingface.co/suno/bark-small) model for text-to-speech:

	![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
	"""

	# Outer container that hosts the tabbed layout.
	demo = gr.Blocks()

	# Arguments shared by both interfaces.
	_interface_common = dict(
	    fn=speech_to_speech_translation,
	    title=title,
	    description=description,
	)

	# NOTE(review): ``source=`` is the keyword used by Gradio 3.x; newer releases
	# take ``sources=[...]`` instead - confirm which version this app pins.

	# Tab 1: record from the microphone.
	mic_translate = gr.Interface(
	    inputs=gr.Audio(source="microphone", type="filepath"),
	    outputs=gr.Audio(label="Generated Speech", type="numpy"),
	    **_interface_common,
	)

	# Tab 2: upload an audio file (with one bundled example).
	file_translate = gr.Interface(
	    inputs=gr.Audio(source="upload", type="filepath"),
	    outputs=gr.Audio(label="Generated Speech", type="numpy"),
	    examples=[["./example.wav"]],
	    **_interface_common,
	)

	tab_interfaces = [mic_translate, file_translate]
	tab_labels = ["Microphone", "Audio File"]

	with demo:
	    gr.TabbedInterface(tab_interfaces, tab_labels)

	demo.launch()