import sys

# Make editable installs (e.g. pyharp) under the venv's src directory importable.
sys.path.append('/home/user/audio_ai/diffusers_harp/venv/src')
from pyharp import ModelCard, build_endpoint, save_and_return_filepath
from audiotools import AudioSignal
import scipy.io.wavfile
import torch
import gradio as gr
from diffusers import AudioLDM2Pipeline
import subprocess as sp
# Optional fallback: install pyharp and its dependencies at runtime if the import above fails.
# harp_deps = ["descript-audiotools"]
#
# try:
#     from pyharp import ModelCard, build_endpoint, save_and_return_filepath
# except ImportError:
#     print("Installing harp dependencies...")
#     sp.check_call(["pip", "install", *harp_deps])
#     sp.check_call(["pip", "install", "-e", "git+https://github.com/audacitorch/pyharp.git#egg=pyharp"])
#     sp.check_call(["pip", "install", "pydantic<2.0.0"])
#     from pyharp import ModelCard, build_endpoint, save_and_return_filepath
# Create a Model Card
card = ModelCard(
name='Diffusers AudioLDM2 Generation',
    description='AudioLDM2 text-to-audio generation. Operates on the region selected in the track; the output is not conditioned on the selected audio, which is simply replaced by the generated result.',
author='Team Audio',
tags=['AudioLDM', 'Diffusers', 'Generation']
)
# Load the model
repo_id = "cvssp/audioldm2"
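# Load the pipeline in float16; this assumes a CUDA-capable GPU is available for inference.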
pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
pipe = pipe.to("cuda")
def process_fn(input_audio_path, prompt, negative_prompt, seed, num_inference_steps, audio_length_in_s, num_waveforms_per_prompt):
"""
This function defines the audio processing steps
Args:
input_audio_path (str): the audio filepath to be processed.
<YOUR_KWARGS>: additional keyword arguments necessary for processing.
NOTE: These should correspond to and match order of UI elements defined below.
Returns:
output_audio_path (str): the filepath of the processed audio.
"""
sig = AudioSignal(input_audio_path)
outfile = "./output.wav"
#prompt = "The sound of a hammer hitting a wooden surface."
#negative_prompt = "Low quality."
# set the seed for generator
generator = torch.Generator("cuda").manual_seed(int(seed))
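    # Run the AudioLDM2 pipeline; .audios is a batch of generated waveforms.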
audio = pipe(
prompt,
negative_prompt=negative_prompt,
num_inference_steps=int(num_inference_steps),
audio_length_in_s=audio_length_in_s,
num_waveforms_per_prompt=int(num_waveforms_per_prompt),
generator=generator,
).audios
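    # Write the first generated waveform to disk at the pipeline's 16 kHz sample rate.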
scipy.io.wavfile.write(outfile, rate=16000, data=audio[0])
return outfile
# Build the endpoint
with gr.Blocks() as webapp:
# Define your Gradio interface
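    # NOTE: widget order must match the keyword arguments of process_fn above.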
inputs = [
gr.Audio(
label="Audio Input",
type="filepath"
),
gr.Text(
label="Prompt",
interactive=True
),
gr.Text(
label="Negative Prompt",
interactive=True
),
        gr.Slider(
            label="Seed",
            minimum=0,
            maximum=65535,
            value=0,
            step=1
        ),
gr.Slider(
minimum=1, maximum=500,
step=1, value=1,
label="Inference Steps"
),
gr.Slider(
minimum=2.5, maximum=10.0,
step=2.5, value=2.5,
label="Duration"
),
gr.Slider(
minimum=1, maximum=10,
step=1, value=1,
label="Waveforms Per Prompt"
),
]
# make an output audio widget
output = gr.Audio(label="Audio Output", type="filepath")
# Build the endpoint
ctrls_data, ctrls_button, process_button, cancel_button = build_endpoint(inputs, output, process_fn, card)
#webapp.queue()
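# share=True exposes a public Gradio URL so the endpoint can be reached from HARP remotely.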
webapp.launch(share=True)