import io

import gradio as gr
import scipy
import scipy.io.wavfile
from transformers import AutoProcessor, BarkModel
# Checkpoint shared by the processor and the model so the two cannot drift.
BARK_CHECKPOINT = "suno/bark-small"

# Loaded once at import time and reused by every request.
processor = AutoProcessor.from_pretrained(BARK_CHECKPOINT)
model = BarkModel.from_pretrained(BARK_CHECKPOINT)
# NOTE(review): to_bettertransformer() presumably needs the `optimum`
# package installed -- confirm it is listed in the app's requirements.
model = model.to_bettertransformer()
def greet(text):
    """Synthesize speech for ``text`` with Bark and return it as WAV bytes.

    Args:
        text: Prompt to be spoken. It is passed to the processor as a
            single-element batch.

    Returns:
        The bytes of a WAV file holding the generated waveform.
    """
    inputs = processor(
        text=[text],
        return_tensors="pt",
    )
    # `use_flash_attention_2` is a model-loading option rather than a
    # generation argument, so it is not forwarded to `generate`.
    speech_values = model.generate(**inputs, do_sample=True)
    waveform = speech_values.cpu().numpy().squeeze()

    # Prefer the rate advertised by the model; fall back to the 24 kHz value
    # this script previously hard-coded.
    sample_rate = getattr(model.generation_config, "sample_rate", 24000)

    # Encode in memory instead of via a fixed "tmp.wav": that path was shared
    # by concurrent requests, and reading it back left a file handle unclosed.
    buffer = io.BytesIO()
    scipy.io.wavfile.write(buffer, rate=sample_rate, data=waveform)
    # NOTE(review): confirm the installed Gradio version accepts raw bytes for
    # an "audio" output.
    return buffer.getvalue()
# Web UI: a text box feeding `greet`, whose result is shown by an audio output.
iface = gr.Interface(fn=greet, inputs="text", outputs="audio")
# Start serving the interface.
iface.launch()