|
import os
import random
from glob import glob

import gradio as gr
import numpy as np
import soundfile as sf
import spaces
import torch
from diffusers import StableAudioPipeline
from huggingface_hub import hf_hub_download
from translatepy import Translator
|
|
|
# Shared translator instance; main() uses it to translate prompts to English.
translator = Translator()

# Hugging Face model id handed to StableAudioPipeline.from_pretrained().
model = "stabilityai/stable-audio-open-1.0"

# Largest value of a signed 32-bit integer; upper bound for seeds
# (random draw in main() and the seed slider in the UI).
MAX_SEED = np.iinfo(np.int32).max

# Custom stylesheet passed to gr.Blocks: caps the container width at 690px
# and hides the page footer.
CSS = """
.gradio-container {
    max-width: 690px !important;
}
footer {
    visibility: hidden;
}
"""
|
|
|
# JavaScript passed to gr.Blocks(js=...): reloads the page with the
# `__theme=dark` query parameter unless it is already set.
# The parameter is set through URL.searchParams rather than by string
# concatenation so that an existing query string or #fragment is preserved
# and the resulting URL stays well-formed.
JS = """function () {
    const url = new URL(window.location.href);
    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.replace(url.href);
    }
}"""
|
# Markdown/HTML blurb rendered with gr.Markdown inside the UI accordion.
# The trailing backslashes join the three source lines into one paragraph.
DESCRIPTION = """
<center>
Stable Audio Open 1.0 generates variable-length (up to 47s) stereo audio at 44.1kHz from text prompts. \
It comprises three components: an autoencoder that compresses waveforms into a manageable sequence length, \
a T5-based text embedding for text conditioning, and a transformer-based diffusion (DiT) model that operates in the latent space of the autoencoder.
</center>
"""
|
|
|
|
|
# Load the Stable Audio pipeline once at import time so every request
# reuses the same weights.
if torch.cuda.is_available():
    # Half precision on the GPU.
    pipe = StableAudioPipeline.from_pretrained(
        model,
        torch_dtype=torch.float16)
    pipe = pipe.to("cuda")
else:
    # Without this branch `pipe` is never bound on a CUDA-less host and the
    # first call to main() dies with NameError. Fall back to a float32 CPU
    # pipeline (slow, but functional).
    pipe = StableAudioPipeline.from_pretrained(
        model,
        torch_dtype=torch.float32)
|
|
|
|
|
|
|
@spaces.GPU(duration=120)
def main(
    prompt,
    negative="low quality",
    second: float = 10.0,
    seed: int = -1):
    """Generate an audio clip from a text prompt and save it as a WAV file.

    Args:
        prompt: Text description of the sound; it is translated to English
            before being sent to the pipeline.
        negative: Negative prompt forwarded to the pipeline.
        second: Forwarded to the pipeline as ``audio_end_in_s``.
        seed: Seed for the random generator. ``-1`` (or ``None``, e.g. after
            the UI field was cleared) draws a random seed in
            ``[0, MAX_SEED]``.

    Returns:
        A ``(audio_path, seed)`` tuple: the path of the written WAV file
        under ``outputs/`` and the integer seed that was actually used.
    """
    # Resolve the seed: the sentinel -1 / a missing value means "pick one".
    if seed is None or int(seed) == -1:
        seed = random.randint(0, MAX_SEED)
    seed = int(seed)
    generator = torch.Generator().manual_seed(seed)

    prompt = str(translator.translate(prompt, 'English'))

    print(f'prompt:{prompt}')

    # NOTE(review): three waveforms are generated per prompt but only the
    # first one is saved below.
    audio = pipe(
        prompt,
        negative_prompt=negative,
        audio_end_in_s=second,
        num_inference_steps=200,
        num_waveforms_per_prompt=3,
        generator=generator,
    ).audios

    os.makedirs("outputs", exist_ok=True)
    # Count the existing .wav files (the extension we actually write) so each
    # run gets a fresh, zero-padded file name instead of overwriting 000000.wav.
    base_count = len(glob(os.path.join("outputs", "*.wav")))
    audio_path = os.path.join("outputs", f"{base_count:06d}.wav")

    # Transpose the first waveform before writing — presumably from
    # (channels, samples) to the (samples, channels) layout soundfile expects.
    sf.write(audio_path, audio[0].T.float().cpu().numpy(), pipe.vae.sampling_rate)

    return audio_path, seed
|
|
|
|
|
|
|
# Build the Gradio UI: description, audio output, prompt fields, duration and
# seed sliders, and the Send/Clear buttons wired to main().
with gr.Blocks(theme='soft', css=CSS, js=JS, title="Stable Audio Open") as iface:
    with gr.Accordion(""):
        gr.Markdown(DESCRIPTION)
    output = gr.Audio(label="Podcast", type="filepath", interactive=False, autoplay=True, elem_classes="audio")
    prompt = gr.Textbox(label="Prompt", placeholder="1000 BPM percussive sound of water drops")
    negative = gr.Textbox(label="Negative prompt", placeholder="Low quality")
    with gr.Row():
        # No trailing commas here: `x = gr.Slider(...),` would bind a
        # one-element tuple instead of the component and break the event wiring.
        # Upper bound is 47 s, the maximum length stated in DESCRIPTION.
        second = gr.Slider(5.0, 47.0, value=10.0, label="Second", step=0.1)
        # Range starts at -1 so that both the default (0) and main()'s
        # "random seed" sentinel (-1) are selectable.
        seed = gr.Slider(-1, MAX_SEED, value=0, label="Seed", step=1)
    with gr.Row():
        # NOTE(review): the emoji in these labels were mis-encoded in the
        # file and have been restored — confirm the intended glyphs.
        submit_btn = gr.Button("🚀 Send")
        clear_btn = gr.ClearButton([prompt, seed, output], value="🗑️ Clear")

    # main() returns (audio_path, seed); the used seed is written back to the
    # slider so a run can be reproduced.
    submit_btn.click(main, inputs=[prompt, negative, second, seed], outputs=[output, seed])

iface.queue().launch(show_api=False)