import torch
from threading import Thread
from transformers import AutoProcessor
from transformers import set_seed
from utils.vocos_bark import BarkModel
from scipy.io.wavfile import write
from pydub import AudioSegment
import numpy as np
import os
import gradio as gr
import uuid
import io

set_seed(0)


def _grab_best_device(use_gpu=True):
    if torch.cuda.device_count() > 0 and use_gpu:
        device = "cuda"
    else:
        device = "cpu"
    return device


device = _grab_best_device()

HUB_PATH = "suno/bark"

processor = AutoProcessor.from_pretrained(HUB_PATH)
speaker_embeddings = sorted([key for key in processor.speaker_embeddings.keys() if "speaker" in key])
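# Note: these keys follow Bark's preset naming (e.g. "v2/en_speaker_6"); the exact
# list depends on the speaker embeddings bundled with the processor.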

SAMPLE_RATE = 24_000

# load the model (BetterTransformer is only applied on the GPU path here)
if device == "cpu":
    bark = BarkModel.from_pretrained(HUB_PATH)
else:
    bark = BarkModel.from_pretrained(HUB_PATH).to(device)
    bark = bark.to_bettertransformer()


# inference
def generate_audio(text, voice_preset=None, lag=0):
    # fall back to no speaker embedding if the preset is unknown
    if voice_preset not in speaker_embeddings:
        voice_preset = None
    sentences = [
        text,
    ]
    inputs = processor(sentences, voice_preset=voice_preset).to(device)
    # Generate the full waveform for the input text in a single (blocking) call.
    waveform = bark.generate(
        **inputs, coarse_temperature=0.8, semantic_temperature=0.5
    )
    return (SAMPLE_RATE, waveform.squeeze().cpu().numpy())

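# The scipy `write` import above is unused by the app itself; as a minimal sketch
# (assuming a valid preset name such as "v2/en_speaker_6"), a generated clip could
# be saved to disk like this:
#
#     sr, audio = generate_audio("Hello world", voice_preset="v2/en_speaker_6")
#     write("sample.wav", sr, audio)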

# Gradio blocks demo
with gr.Blocks() as demo_blocks:
    gr.Markdown("""<h1 align="center">🐶BARK with Vocos</h1>""")
    gr.HTML("""<h3 style="text-align:center;">📢Audio Streaming powered by Gradio 🦾!</h3>""")
    with gr.Group():
        with gr.Row():
            inp_text = gr.Textbox(label="What should Bark say?", info="Enter text here")
            dd = gr.Dropdown(
                speaker_embeddings,
                value=None,
                label="Available voice presets",
                info="Defaults to no speaker embeddings!",
            )
        with gr.Row():
            btn = gr.Button("Bark with Vocos TTS")
    with gr.Row():
        out_audio = gr.Audio(type="numpy", autoplay=True)

    btn.click(generate_audio, [inp_text, dd], out_audio)

demo_blocks.queue().launch(debug=True)