davidggphy's picture
Adapt to Whisper (es) + Bark (es)
245bced
raw
history blame
3.08 kB
import gradio as gr
import numpy as np
import torch
from transformers import BarkModel
from transformers import AutoProcessor
from transformers import pipeline
import librosa
# Run on the first CUDA device when PyTorch can see one, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Bark (small) text-to-speech checkpoint: the processor prepares the inputs,
# the model (moved to `device`) generates the waveform.
processor = AutoProcessor.from_pretrained("suno/bark-small")
model = BarkModel.from_pretrained("suno/bark-small").to(device)

# Voice-preset prefixes keyed by language code; `tts` appends a speaker index.
# Preset catalogue:
# https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c
language_presets = {
    "es": "v2/es_speaker_",
    "en": "v2/en_speaker_",
}
def tts(text, language="es", style: int = 0):
    """Generate speech for `text` with the Bark model.

    Args:
        text: Text passed to the Bark processor.
        language: Key into `language_presets` selecting the preset prefix.
        style: Speaker index appended to the preset prefix.

    Returns:
        A `(waveform, sampling_rate)` tuple: the first generated output as a
        NumPy array on the CPU, and the sample rate read from the model's
        generation config.

    Raises:
        KeyError: If `language` is not a key of `language_presets`.
    """
    preset_name = f"{language_presets[language]}{style}"

    # Tokenise the text together with the chosen voice preset, then move the
    # encoded inputs to the same device as the model.
    encoded = processor(text, voice_preset=preset_name).to(device)

    generated = model.generate(**encoded)

    # Only the first generated item is returned (presumably the single entry
    # of the batch, since one text is passed in).
    waveform = generated[0].cpu().numpy()
    return waveform, model.generation_config.sample_rate
# Whisper (base) speech-recognition pipeline, placed on the same device as Bark.
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
    device=device,
)
def translate(audio, language: str = "es"):
    """Run the Whisper pipeline on `audio` and return the decoded text.

    The pipeline is called with task "transcribe" and `language` as the
    decoding language, generating at most 256 new tokens.

    Args:
        audio: Input forwarded unchanged to `asr_pipe` (the Gradio interfaces
            in this file pass a file path).
        language: Language code handed to Whisper's generation.

    Returns:
        The "text" field of the pipeline output.
    """
    decoding_options = {"task": "transcribe", "language": language}
    result = asr_pipe(audio, max_new_tokens=256, generate_kwargs=decoding_options)
    return result["text"]
def synthesise(text, language="es", style=0):
    """Synthesise `text` with `tts` and resample the result to 16 kHz.

    Args:
        text: Text to speak.
        language: Language key forwarded to `tts`.
        style: Speaker index forwarded to `tts`.

    Returns:
        A `(waveform, 16000)` tuple with the resampled waveform.
    """
    output_rate = 16_000
    waveform, native_rate = tts(text, language=language, style=style)
    resampled = librosa.resample(
        waveform,
        orig_sr=native_rate,
        target_sr=output_rate,
    )
    return resampled, output_rate
def speech_to_speech_translation(audio, debug=True):
    """Turn input speech into synthesised speech via text (Whisper, then Bark).

    The audio is converted to text with `translate`, the text is spoken with
    `synthesise`, and the float waveform is converted to 16-bit PCM for the
    Gradio audio output.

    Args:
        audio: Input forwarded to `translate` (a file path when called from
            the Gradio interfaces in this file).
        debug: When true, print the intermediate text to stdout.

    Returns:
        A `(sampling_rate, samples)` tuple where `samples` is an int16 NumPy
        array.
    """
    translated_text = translate(audio)
    if debug:
        print(f"{translated_text=}")

    synthesised_speech, sampling_rate = synthesise(translated_text)

    # Convert to int16 for Gradio. Samples slightly outside [-1, 1] (e.g. from
    # resampling overshoot) would overflow the int16 cast and wrap around,
    # producing loud clicks, so saturate to the valid range first.
    int16_range = np.iinfo(np.int16)
    scaled = np.asarray(synthesised_speech) * 32767
    pcm = np.clip(scaled, int16_range.min, int16_range.max).astype(np.int16)
    return sampling_rate, pcm
# Text shown at the top of each Gradio interface. The description names the
# models this script actually loads: Whisper Base for speech-to-text and
# Bark (small) for text-to-speech, with Spanish as the default target language.
title = "Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Spanish. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Suno's
[Bark (small)](https://huggingface.co/suno/bark-small) model for text-to-speech:
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""
# Container for the tabbed UI assembled below.
demo = gr.Blocks()

# Tab 1: record from the microphone; the recording reaches the pipeline as a
# file path and the result is played back from a NumPy array.
mic_translate = gr.Interface(
    speech_to_speech_translation,
    gr.Audio(source="microphone", type="filepath"),
    gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
)

# Tab 2: same pipeline, but the audio comes from an uploaded file; one bundled
# example clip is offered.
file_translate = gr.Interface(
    speech_to_speech_translation,
    gr.Audio(source="upload", type="filepath"),
    gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],
    title=title,
    description=description,
)

with demo:
    gr.TabbedInterface(
        interface_list=[mic_translate, file_translate],
        tab_names=["Microphone", "Audio File"],
    )

demo.launch()