artificialguybr committed
Commit
b725215
1 Parent(s): 4c9245b

Update app.py

Files changed (1)
  1. app.py +19 -1
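In summary, the commit instruments app.py with print-based progress logging at each stage of generation and lowers the maximum of the "Number of Diffusion Steps" slider from 300 to 150.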
app.py CHANGED
@@ -12,23 +12,33 @@ from stable_audio_tools.inference.generation import generate_diffusion_cond
 
 # Load the model outside of the GPU-decorated function
 def load_model():
+    print("Loading model...")
     model, model_config = get_pretrained_model("stabilityai/stable-audio-open-1.0")
+    print("Model loaded successfully.")
     return model, model_config
 
 # Function to set up, generate, and process the audio
 @spaces.GPU(duration=120)  # Allocate GPU only when this function is called
 def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7):
+    print(f"Prompt received: {prompt}")
+    print(f"Settings: Duration={seconds_total}s, Steps={steps}, CFG Scale={cfg_scale}")
+
     device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Using device: {device}")
 
     # Fetch the Hugging Face token from the environment variable
     hf_token = os.getenv('HF_TOKEN')
+    print(f"Hugging Face token: {hf_token}")
 
     # Use pre-loaded model and configuration
     model, model_config = load_model()
     sample_rate = model_config["sample_rate"]
     sample_size = model_config["sample_size"]
 
+    print(f"Sample rate: {sample_rate}, Sample size: {sample_size}")
+
     model = model.to(device)
+    print("Model moved to device.")
 
     # Set up text and timing conditioning
     conditioning = [{
@@ -36,8 +46,10 @@ def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7):
         "seconds_start": 0,
         "seconds_total": seconds_total
     }]
+    print(f"Conditioning: {conditioning}")
 
     # Generate stereo audio
+    print("Generating audio...")
     output = generate_diffusion_cond(
         model,
         steps=steps,
@@ -49,17 +61,23 @@ def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7):
         sampler_type="dpmpp-3m-sde",
         device=device
     )
+    print("Audio generated.")
 
     # Rearrange audio batch to a single sequence
     output = rearrange(output, "b d n -> d (b n)")
+    print("Audio rearranged.")
 
     # Peak normalize, clip, convert to int16
     output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
+    print("Audio normalized and converted.")
 
     # Generate a unique filename for the output
     unique_filename = f"output_{uuid.uuid4().hex}.wav"
+    print(f"Saving audio to file: {unique_filename}")
+
     # Save to file
     torchaudio.save(unique_filename, output, sample_rate)
+    print(f"Audio saved: {unique_filename}")
 
     # Return the path to the generated audio file
     return unique_filename
@@ -70,7 +88,7 @@ interface = gr.Interface(
     inputs=[
         gr.Textbox(label="Prompt", placeholder="Enter your text prompt here"),
         gr.Slider(0, 47, value=30, label="Duration in Seconds"),
-        gr.Slider(10, 300, value=100, step=10, label="Number of Diffusion Steps"),
+        gr.Slider(10, 150, value=100, step=10, label="Number of Diffusion Steps"),
         gr.Slider(1, 15, value=7, step=0.1, label="CFG Scale")
     ],
     outputs=gr.Audio(type="filepath", label="Generated Audio"),
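The post-processing step in the diff chains peak normalization, clipping, and int16 conversion into a single expression. An unpacked sketch of the same step can make the intent easier to follow; the helper name to_int16 and the standalone form are ours, not part of app.py:

import torch

# Unpacked equivalent of the one-line post-processing step in app.py.
# "waveform" stands in for the tensor returned by generate_diffusion_cond
# after the rearrange call.
def to_int16(waveform: torch.Tensor) -> torch.Tensor:
    waveform = waveform.to(torch.float32)
    peak = waveform.abs().max()       # largest absolute sample value
    waveform = waveform / peak        # peak-normalize into [-1, 1]
    waveform = waveform.clamp(-1, 1)  # guard against any overshoot
    return (waveform * 32767).to(torch.int16).cpu()  # scale to 16-bit PCM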
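One caution on the new logging: the added line print(f"Hugging Face token: {hf_token}") writes the raw HF_TOKEN secret into the Space logs. A minimal sketch of a safer variant, assuming the goal is only to confirm the token is configured:

import os

hf_token = os.getenv('HF_TOKEN')
# Report whether the token is set without exposing its value
print(f"Hugging Face token set: {hf_token is not None}")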