import torch
import gradio as gr

from transformers import AutoProcessor, set_seed
from utils.vocos_bark import BarkModel

set_seed(0)


def _grab_best_device(use_gpu=True):
    # Prefer CUDA when a GPU is available and requested; otherwise run on CPU.
    if torch.cuda.device_count() > 0 and use_gpu:
        device = "cuda"
    else:
        device = "cpu"
    return device


device = _grab_best_device()

HUB_PATH = "suno/bark"

processor = AutoProcessor.from_pretrained(HUB_PATH)
speaker_embeddings = sorted(
    key for key in processor.speaker_embeddings.keys() if "speaker" in key
)

SAMPLE_RATE = 24_000

# Load the model; on GPU, convert to BetterTransformer for faster attention.
if device == "cpu":
    bark = BarkModel.from_pretrained(HUB_PATH)
else:
    bark = BarkModel.from_pretrained(HUB_PATH).to(device)
    bark = bark.to_bettertransformer()


# Inference
def generate_audio(text, voice_preset=None):
    # Unknown presets fall back to no speaker embedding.
    if voice_preset not in speaker_embeddings:
        voice_preset = None

    sentences = [text]
    inputs = processor(sentences, voice_preset=voice_preset).to(device)

    waveform = bark.generate(
        **inputs,
        coarse_temperature=0.8,
        semantic_temperature=0.5,
    )

    return (SAMPLE_RATE, waveform.squeeze().cpu().numpy())


# Gradio Blocks demo
with gr.Blocks() as demo_blocks:
    gr.Markdown("""
        # 🐶BARK with Vocos
""") gr.HTML("""

📢Audio Streaming powered by Gradio 🦾!

""") with gr.Group(): with gr.Row(): inp_text = gr.Textbox(label="What should Bark say?", info="Enter text here") dd = gr.Dropdown( speaker_embeddings, value=None, label="Available voice presets", info="Defaults to no speaker embeddings!" ) with gr.Row(): btn = gr.Button("Bark with Vocos TTS") with gr.Row(): out_audio = gr.Audio(type="numpy", autoplay=True) btn.click(generate_audio, [inp_text, dd], out_audio) demo_blocks.queue().launch(debug=True)