speech-to-speech-translation

Sleeping

File size: 2,956 Bytes

0a5f7db
 
 
37cb8cc
 
 
 
 
 
 
 
 
0a5f7db
 
 
37cb8cc
0a5f7db
37cb8cc
 
 
 
0a5f7db
 
 
 
37cb8cc
 
 
 
 
 
 
 
0a5f7db
37cb8cc
 
 
 
 
 
 
 
 
 
0a5f7db
37cb8cc
 
0a5f7db
 
 
 
 
 
37cb8cc
 
0a5f7db
 
 
37cb8cc
0a5f7db
 
 
 
37cb8cc
 
0a5f7db
 
 
37cb8cc
0a5f7db
 
 
 
 
37cb8cc

import gradio as gr
import numpy as np
import torch
from transformers import AutoProcessor, pipeline, BarkModel

# Checkpoint identifiers for the two stages of the cascade.
ASR_MODEL_NAME = "bofenghuang/whisper-large-v2-cv11-german"
TTS_MODEL_NAME = "suno/bark-small"

# NOTE(review): not referenced anywhere in the visible part of this file.
BATCH_SIZE = 8

# Bark voice presets keyed by lower-case voice name.
# NOTE(review): both presets are "en_speaker" voices although the demo title
# advertises German output - confirm this is intended.
voices = dict(
    male="v2/en_speaker_6",
    female="v2/en_speaker_9",
)

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Load the speech-translation checkpoint as an ASR pipeline that processes
# the input in 10-second chunks.
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    chunk_length_s=10,
    device=device,
)

# Load the text-to-speech checkpoint. Processor and model both read
# TTS_MODEL_NAME so the two cannot drift apart when the checkpoint changes.
processor = AutoProcessor.from_pretrained(TTS_MODEL_NAME)
model = BarkModel.from_pretrained(TTS_MODEL_NAME).to(device)

# Sample rate of the generated audio, as declared by the model's generation config.
sampling_rate = model.generation_config.sample_rate

def translate(audio):
    """Run the speech pipeline on ``audio`` and return the produced text.

    The pipeline is called with the generation task set to ``"translate"``
    and at most 256 newly generated tokens.

    Args:
        audio: Input handed unchanged to ``asr_pipe``.

    Returns:
        The ``"text"`` entry of the pipeline output.
    """
    generation_options = {"task": "translate"}
    result = asr_pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs=generation_options,
    )
    return result["text"]
    
def synthesise(text, voice_preset):
    """Generate speech for ``text`` using the given voice preset.

    Args:
        text: Text to be spoken.
        voice_preset: Preset identifier forwarded to the processor.

    Returns:
        The first element of the model's generated output.
    """
    encoded = processor(
        text=text,
        return_tensors="pt",
        voice_preset=voice_preset,
    )
    # Move the encoded inputs to the device the model lives on.
    encoded = encoded.to(device)
    generated = model.generate(**encoded)
    return generated[0]
    
def speech_to_speech_translation(audio, voice):
    """Translate spoken audio to text and re-synthesise it as speech.

    Args:
        audio: Audio input forwarded unchanged to ``translate``.
        voice: ``"Female"`` selects the female preset; any other value
            selects the male preset.

    Returns:
        A ``(sampling_rate, samples)`` tuple where ``samples`` is an int16
        NumPy array.
    """
    translated_text = translate(audio)
    print(translated_text)

    voice_preset = voices["female"] if voice == "Female" else voices["male"]
    waveform = synthesise(translated_text, voice_preset).cpu().numpy()

    # Clamp to [-1, 1] before scaling: a sample outside that range would not
    # fit into int16 once multiplied by 32767 and would overflow the cast.
    waveform = np.clip(waveform, -1.0, 1.0)
    synthesised_speech = (waveform * 32767).astype(np.int16)
    return sampling_rate, synthesised_speech
    
# Heading and Markdown description shown on the demo interfaces.
title = "Cascaded STST - Any language to German speech"
# The link label names the checkpoint the link actually points to (bark-small).
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in German. Demo uses fine-tuned version of openai/whisper-large-v2 model (https://huggingface.co/bofenghuang/whisper-large-v2-cv11-german) for speech translation, and Suno's
[Bark-small](https://huggingface.co/suno/bark-small) model for text-to-speech:
![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""
# Top-level container; the two interfaces are placed into it as tabs below.
demo = gr.Blocks()

# Interface fed by a live microphone recording.
mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        # gr.Radio replaces the deprecated gr.inputs.Radio; `value` is the
        # current name of the former `default` argument.
        gr.Radio(["Male", "Female"], label="Voice", value="Male"),
    ],
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
    allow_flagging="never",
)

# Interface fed by an uploaded audio file.
file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=[
        gr.Audio(source="upload", type="filepath"),
        # gr.Radio replaces the deprecated gr.inputs.Radio; `value` is the
        # current name of the former `default` argument.
        gr.Radio(["Male", "Female"], label="Voice", value="Male"),
    ],
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
    allow_flagging="never",
)

# Place both interfaces into the Blocks container as labelled tabs.
tab_interfaces = [mic_translate, file_translate]
tab_labels = ["Microphone", "Audio File"]
with demo:
    gr.TabbedInterface(tab_interfaces, tab_labels)

# Enable request queuing with the configured limits, then start the app.
demo.queue(concurrency_count=2, max_size=10)
demo.launch()