import os
import gradio as gr
import torch
from diffusers import StableAudioPipeline
import spaces
from translatepy import Translator
import numpy as np
import random
import soundfile as sf
# Shared translatepy client; `main` uses it to translate every prompt to English.
translator = Translator()
# Constants
# Hugging Face model id handed to `StableAudioPipeline.from_pretrained`.
model = "stabilityai/stable-audio-open-1.0"
# Largest signed 32-bit integer: upper bound for random seeds and for the seed slider.
MAX_SEED = np.iinfo(np.int32).max
# Page styling passed to `gr.Blocks(css=...)`: caps the container width at 690px
# and hides the footer element.
CSS = """
.gradio-container {
max-width: 690px !important;
}
footer {
visibility: hidden;
}
"""
# Script passed to `gr.Blocks(js=...)`: unless the current URL already ends with
# `?__theme=dark`, redirect to the same URL with that suffix appended.
# NOTE(review): the suffix is appended by plain string concatenation, so a URL
# that already carries a query string is not handled specially — confirm this
# is acceptable.
JS = """function () {
gradioURL = window.location.href
if (!gradioURL.endsWith('?__theme=dark')) {
window.location.replace(gradioURL + '?__theme=dark');
}
}"""
# Markdown blurb rendered inside the accordion at the top of the UI.
DESCRIPTION = """
Stable Audio Open 1.0 generates variable-length (up to 47s) stereo audio at 44.1kHz from text prompts. \
It comprises three components: an autoencoder that compresses waveforms into a manageable sequence length, \
a T5-based text embedding for text conditioning, and a transformer-based diffusion (DiT) model that operates in the latent space of the autoencoder.
"""
# Load the Stable Audio pipeline once, at import time, in half precision and
# place it on the GPU. This only happens when CUDA is reported as available.
# NOTE: without CUDA the name `pipe` is never bound, so `main` will fail with a
# NameError when it tries to call it.
if torch.cuda.is_available():
    pipe = StableAudioPipeline.from_pretrained(
        model,
        torch_dtype=torch.float16,
    ).to("cuda")
# Function
@spaces.GPU(duration=120)
def main(
        prompt,
        negative="low quality",
        second: float = 10.0,
        seed: int = -1):
    """Generate audio from a text prompt and save it as a WAV file.

    The prompt is first translated to English, then passed to the module-level
    Stable Audio pipeline. The first generated waveform is written to the
    ``outputs`` directory under a zero-padded, incrementing file name.

    Args:
        prompt: Text description of the desired audio, in any language the
            translator can handle.
        negative: Negative prompt forwarded to the pipeline unchanged.
        second: Value passed to the pipeline as ``audio_end_in_s``.
        seed: Seed for the random generator. ``-1`` (or ``None``) means
            "pick a random seed in ``[0, MAX_SEED]``".

    Returns:
        A ``(audio_path, seed)`` tuple: the path of the written WAV file and
        the integer seed that was actually used, so the UI can display it.
    """
    # Function-scope import: `glob` is not imported at the top of the module,
    # so the original call raised NameError on every request.
    from glob import glob

    if seed is None or seed == -1:
        seed = random.randint(0, MAX_SEED)
    # Sliders may deliver the value as a float; manual_seed needs an int.
    seed = int(seed)
    generator = torch.Generator().manual_seed(seed)

    prompt = str(translator.translate(prompt, 'English'))
    print(f'prompt:{prompt}')

    # Three waveforms are generated per prompt, but only the first is saved.
    audio = pipe(
        prompt,
        negative_prompt=negative,
        audio_end_in_s=second,
        num_inference_steps=200,
        num_waveforms_per_prompt=3,
        generator=generator,
    ).audios

    os.makedirs("outputs", exist_ok=True)
    # Count the files we actually write (.wav). Counting "*.mp4" always gave 0,
    # so every request overwrote outputs/000000.wav.
    base_count = len(glob(os.path.join("outputs", "*.wav")))
    audio_path = os.path.join("outputs", f"{base_count:06d}.wav")

    # Transposed before writing, as in the diffusers usage example
    # (presumably channels-first -> frames-first for soundfile; confirm).
    # The attribute is `sampling_rate`; the original `samping_rate` typo
    # raised AttributeError.
    waveform = audio[0].T.float().cpu().numpy()
    sf.write(audio_path, waveform, pipe.vae.sampling_rate)
    return audio_path, seed
# Gradio Interface
# Build the UI: description, audio player, prompt inputs, duration/seed
# sliders and the send/clear buttons, then wire the send button to `main`.
with gr.Blocks(theme='soft', css=CSS, js=JS, title="Stable Audio Open") as iface:
    # Collapsible panel holding the model description.
    with gr.Accordion(""):
        gr.Markdown(DESCRIPTION)
    # Read-only audio player; receives the file path returned by `main`.
    output = gr.Audio(label="Podcast", type="filepath", interactive=False, autoplay=True, elem_classes="audio")
    prompt = gr.Textbox(label="Prompt", placeholder="1000 BPM percussive sound of water drops")
    negative = gr.Textbox(label="Negative prompt", placeholder="Low quality")
    with gr.Row():
        # No trailing commas: `x = gr.Slider(...),` binds a one-element tuple
        # instead of the component, which breaks ClearButton and click().
        # Duration is capped at 47 s, the maximum length stated in DESCRIPTION;
        # the pipeline is expected to reject longer requests (see diffusers
        # StableAudioPipeline documentation).
        second = gr.Slider(5.0, 47.0, value=10.0, label="Second", step=0.1)
        seed = gr.Slider(-1, MAX_SEED, value=-1, label="Seed", step=1)
    with gr.Row():
        submit_btn = gr.Button("🚀 Send")
        clear_btn = gr.ClearButton([prompt, seed, output], value="🗑️ Clear")
    # Run generation on click; the seed actually used is written back to the
    # seed slider so the result can be reproduced.
    submit_btn.click(main, inputs=[prompt, negative, second, seed], outputs=[output, seed])

# Enable request queuing and start the server without exposing the API page.
iface.queue().launch(show_api=False)