Spaces:
Runtime error
Runtime error
Commit
·
245bced
1
Parent(s):
3b47781
Adapt to Whisper (es) + Bark (es)
Browse files
app.py
CHANGED
@@ -1,42 +1,58 @@
|
|
1 |
import gradio as gr
|
2 |
import numpy as np
|
3 |
import torch
|
4 |
-
from
|
|
|
|
|
|
|
5 |
|
6 |
-
|
|
|
7 |
|
8 |
|
9 |
device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
|
|
10 |
|
11 |
-
# load speech translation checkpoint
|
12 |
-
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
|
13 |
|
14 |
-
#
|
15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
-
|
18 |
-
|
19 |
|
20 |
-
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
|
21 |
-
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
|
22 |
|
|
|
|
|
|
|
23 |
|
24 |
-
|
25 |
-
outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
|
26 |
-
return outputs["text"]
|
27 |
|
28 |
|
29 |
-
def synthesise(text):
|
30 |
-
|
31 |
-
|
32 |
-
|
|
|
33 |
|
34 |
|
35 |
-
def speech_to_speech_translation(audio):
|
36 |
translated_text = translate(audio)
|
37 |
-
|
38 |
-
|
39 |
-
|
|
|
|
|
|
|
40 |
|
41 |
|
42 |
title = "Cascaded STST"
|
|
|
1 |
import gradio as gr
|
2 |
import numpy as np
|
3 |
import torch
|
4 |
+
from transformers import BarkModel
|
5 |
+
from transformers import AutoProcessor
|
6 |
+
from transformers import pipeline
|
7 |
+
import librosa
|
8 |
|
9 |
+
# Checkpoint shared by the Bark processor and the Bark model.
_BARK_CHECKPOINT = "suno/bark-small"

# Use the first CUDA device when torch reports one, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Load the Bark processor and model, placing the model on the chosen device.
processor = AutoProcessor.from_pretrained(_BARK_CHECKPOINT)
model = BarkModel.from_pretrained(_BARK_CHECKPOINT).to(device)
|
15 |
|
|
|
|
|
16 |
|
17 |
+
# Reference link kept from the original source (presumably the Bark speaker
# preset library -- confirm):
# https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c
# Maps a language code to the prefix of its voice-preset names.
language_presets = {
    "es": "v2/es_speaker_",
    "en": "v2/en_speaker_",
}


def tts(text, language="es", style: int = 0):
    """Generate speech for ``text`` with the Bark model.

    Args:
        text: Text to synthesise.
        language: Key into ``language_presets`` ("es" or "en").
        style: Value appended (via ``str``) to the language's preset prefix
            to form the full voice-preset name.

    Returns:
        A ``(waveform, sampling_rate)`` tuple: the first generated output
        moved to the CPU as a NumPy array, and the sample rate read from the
        model's generation config.

    Raises:
        KeyError: If ``language`` is not a key of ``language_presets``.
    """
    preset_prefix = language_presets[language]
    preset_name = preset_prefix + str(style)

    # Prepare the model inputs and move them to the model's device.
    model_inputs = processor(text, voice_preset=preset_name).to(device)

    # Generate the speech and hand back the first item as a NumPy array.
    generated = model.generate(**model_inputs)
    waveform = generated[0].cpu().numpy()
    return waveform, model.generation_config.sample_rate
|
28 |
+
|
29 |
|
30 |
+
# Speech-recognition pipeline (Whisper base checkpoint) used by translate().
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base",
    device=device,
)


def translate(audio, language: str = "es"):
    """Run the speech-recognition pipeline on ``audio`` and return its text.

    NOTE(review): despite the function name, the pipeline is invoked with
    ``task="transcribe"`` and the requested ``language`` forced through the
    generation kwargs; presumably this is what yields text in that language
    -- confirm against the Whisper documentation.

    Args:
        audio: Input passed straight through to ``asr_pipe``.
        language: Language code forwarded in ``generate_kwargs``.

    Returns:
        The ``"text"`` entry of the pipeline output.
    """
    generation_options = {"task": "transcribe", "language": language}
    result = asr_pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs=generation_options,
    )
    return result["text"]
|
|
|
|
|
39 |
|
40 |
|
41 |
+
def synthesise(text, language="es", style=0):
    """Synthesise ``text`` with ``tts`` and resample the audio to 16 kHz.

    Args:
        text: Text to synthesise.
        language: Forwarded to ``tts``.
        style: Forwarded to ``tts``.

    Returns:
        A ``(waveform, sampling_rate)`` tuple where ``sampling_rate`` is
        always 16000.
    """
    output_rate = 16_000
    waveform, native_rate = tts(text, language=language, style=style)
    resampled = librosa.resample(
        waveform,
        orig_sr=native_rate,
        target_sr=output_rate,
    )
    return resampled, output_rate
|
46 |
|
47 |
|
48 |
+
def speech_to_speech_translation(audio, debug=True):
    """Turn input speech into synthesised speech of the recognised text.

    The audio is passed through ``translate`` to obtain text, the text is
    passed through ``synthesise`` to obtain a float waveform, and the
    waveform is converted to 16-bit integers.

    Args:
        audio: Input forwarded to ``translate``.
        debug: When true, print the intermediate text to stdout.

    Returns:
        A ``(sampling_rate, waveform)`` tuple where ``waveform`` is an
        ``np.int16`` array.
    """
    translated_text = translate(audio)
    if debug:
        print(f"{translated_text=}")
    synthesised_speech, sampling_rate = synthesise(translated_text)

    # Transform to int16 for Gradio. Clip to [-1, 1] first: samples outside
    # that range (e.g. resampling overshoot) would otherwise overflow int16
    # when scaled, producing loud clicks instead of gentle saturation.
    waveform = np.clip(np.asarray(synthesised_speech), -1.0, 1.0)
    synthesised_speech = (waveform * 32767).astype(np.int16)
    return sampling_rate, synthesised_speech
|
56 |
|
57 |
|
58 |
title = "Cascaded STST"
|