File size: 3,813 Bytes
c675805
9f58137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0b28f04
9f58137
b52d0fa
2ed8747
 
 
 
 
 
3795637
2ed8747
b52d0fa
2ed8747
 
9f58137
 
 
 
 
 
 
 
0b28f04
9f58137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0b28f04
9f58137
 
 
 
2ed8747
 
3795637
 
 
 
 
 
 
 
 
 
9f58137
3795637
 
 
 
9f58137
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import torch
from threading import Thread

from transformers import AutoProcessor
from transformers import set_seed

from vocos_bark import BarkModel
import numpy as np

import gradio as gr
from vocos import Vocos

# Seed the random number generators once at import time (via
# transformers.set_seed) so sampling starts from a fixed state on every launch.
set_seed(0)

def _grab_best_device(use_gpu=True):
    """Pick the torch device string to run on.

    Returns ``"cuda"`` when torch reports at least one CUDA device and
    ``use_gpu`` is truthy; otherwise returns ``"cpu"``.
    """
    cuda_visible = torch.cuda.device_count() > 0
    return "cuda" if cuda_visible and use_gpu else "cpu"

# Hub repository that provides both the Bark processor and the Bark weights.
HUB_PATH = "suno/bark"

# Sample rate (Hz) reported to Gradio together with the generated waveform.
SAMPLE_RATE = 24_000

device = _grab_best_device()

processor = AutoProcessor.from_pretrained(HUB_PATH)

# Voice presets offered in the UI: every processor embedding key that contains
# "speaker", in sorted order.
speaker_embeddings = sorted(
    key for key in processor.speaker_embeddings.keys() if "speaker" in key
)

# Vocos vocoder used to turn Bark's codes into audio; lives on the same device.
vocos = Vocos.from_pretrained("charactr/vocos-encodec-24khz").to(device)

# Page heading rendered through gr.Markdown. Keep any inline HTML balanced:
# the heading opens no <div>, so it must not close one either.
title = "# 🐶 Bark with <a href='https://github.com/charactr-platform/vocos'>Vocos</a>"

# Intro blurb rendered below the heading: a badge row (GitHub stars, "Duplicate
# Space") followed by a short description. The trailing backslashes join the
# three sentences into a single paragraph line.
description = """
<div>
<a style="display:inline-block" href='https://github.com/charactr-platform/vocos'><img src='https://img.shields.io/github/stars/charactr-platform/vocos?style=social' /></a>
<a style="display:inline-block; margin-left: 1em" href="https://huggingface.co/spaces/hf-audio/vocos-bark?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space%20to%20skip%20the%20queue-blue?labelColor=white&style=flat&logo=&logoWidth=14" alt="Duplicate Space"></a>
</div>
Bark is a universal text-to-audio model created by Suno. \
Bark can generate highly realistic, multilingual speech as well as other audio - including music, background noise and simple sound effects. \
In this demo, we leverage charactr's Vocos model to create high quality audio from Bark. \
"""

# Load the Bark model once at start-up. On a non-CPU device it is additionally
# moved to that device and passed through `to_bettertransformer()`.
bark = BarkModel.from_pretrained(HUB_PATH)
if device != "cpu":
    bark = bark.to(device)
    bark = bark.to_bettertransformer()


# Inference
def generate_audio(text, voice_preset=None, lag=0):
    """Synthesise audio for ``text`` with Bark and decode it with Vocos.

    Args:
        text: Prompt to synthesise; passed to the processor as a one-item batch.
        voice_preset: Key of a speaker preset. Any value that is not listed in
            ``speaker_embeddings`` is replaced by ``None`` (no preset).
        lag: Accepted for interface compatibility; not used by this function.

    Returns:
        A ``(SAMPLE_RATE, waveform)`` tuple, where ``waveform`` is the decoded
        audio as a squeezed NumPy array on the CPU.
    """
    # Fall back to no preset when the requested voice is unknown.
    preset = voice_preset if voice_preset in speaker_embeddings else None

    model_inputs = processor([text], voice_preset=preset).to(device)

    # Generation runs synchronously in the calling thread.
    fine_output = bark.generate(
        **model_inputs,
        coarse_temperature=0.8,
        temperature=0.5,
        do_sample=True,
    )

    print("Fine tokens generated")

    with torch.no_grad():
        # Swap the first two axes before handing the codes to Vocos
        # (presumably to put the codebook axis first - confirm against Vocos docs).
        codes = fine_output.transpose(0, 1)
        features = vocos.codes_to_features(codes)
        # NOTE(review): index 2 is assumed to select the 6 kbps bandwidth - confirm.
        bandwidth_id = torch.tensor([2], device=device)
        waveform = vocos.decode(features, bandwidth_id=bandwidth_id)

    samples = waveform.cpu().squeeze().numpy()
    return (SAMPLE_RATE, samples)


# Gradio UI: prompt and voice selection on the left, generated audio on the right.
with gr.Blocks() as demo_blocks:
    gr.Markdown(title)
    gr.Markdown(description)

    with gr.Row():
        # Input column.
        with gr.Column():
            inp_text = gr.Textbox(
                label="Input Text",
                info="What would you like bark to synthesise?",
            )
            spk = gr.Dropdown(
                choices=speaker_embeddings,
                value=None,
                label="Acoustic Prompt",
                info="Default: Unconditional Generation",
            )
            btn = gr.Button("Generate Audio!")

        # Output column.
        with gr.Column():
            out_audio_vocos = gr.Audio(
                type="numpy",
                autoplay=False,
                label="Generated Audio",
                show_label=True,
            )

    # Clicking the button runs generate_audio on (text, preset) and plays the result.
    btn.click(
        fn=generate_audio,
        inputs=[inp_text, spk],
        outputs=[out_audio_vocos],
    )

# Enable request queuing, then start the server at import time.
demo_blocks.queue().launch(debug=True)