import io
import os
import uuid
from threading import Thread

import gradio as gr
import numpy as np
import torch
from pydub import AudioSegment
from scipy.io.wavfile import write
from transformers import AutoProcessor, set_seed

from utils.vocos_bark import BarkModel

set_seed(0)


def _grab_best_device(use_gpu=True):
    """Pick a CUDA device when one is available, otherwise fall back to CPU."""
    if torch.cuda.device_count() > 0 and use_gpu:
        device = "cuda"
    else:
        device = "cpu"
    return device


device = _grab_best_device()

HUB_PATH = "suno/bark"

processor = AutoProcessor.from_pretrained(HUB_PATH)

# Collect the named speaker presets shipped with the processor.
speaker_embeddings = sorted(
    [key for key in processor.speaker_embeddings.keys() if "speaker" in key]
)

SAMPLE_RATE = 24_000

# Load the model; on GPU, also enable BetterTransformer for faster attention.
if device == "cpu":
    bark = BarkModel.from_pretrained(HUB_PATH)
else:
    bark = BarkModel.from_pretrained(HUB_PATH).to(device)
    bark = bark.to_bettertransformer()


# Inference
def generate_audio(text, voice_preset=None, lag=0):
    # Fall back to the default voice if the requested preset is unknown.
    if voice_preset not in speaker_embeddings:
        voice_preset = None

    sentences = [text]
    inputs = processor(sentences, voice_preset=voice_preset).to(device)

    # Generate the full waveform for the prompt in a single pass.
    waveform = bark.generate(
        **inputs,
        coarse_temperature=0.8,
        semantic_temperature=0.5,
    )

    return (SAMPLE_RATE, waveform.squeeze().cpu().numpy())


# Gradio Blocks demo
with gr.Blocks() as demo_blocks:
    gr.Markdown("""