hilamanor committed
Commit: 511e6ea
Parent: c5c715d

swap to gradio 4.44 & add adaptive duration

Files changed (4):
  1. README.md +2 -2
  2. app.py +25 -1
  3. requirements.txt +1 -0
  4. utils.py +2 -1
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🎧
 colorFrom: indigo
 colorTo: gray
 sdk: gradio
-sdk_version: 4.21.0
+sdk_version: 4.44.0
 app_file: app.py
 pinned: false
 license: cc-by-sa-4.0
@@ -12,4 +12,4 @@ short_description: Edit audios with text prompts
 ---
 
 The 30-second limit was introduced to ensure that queue wait times remain reasonable, especially when there are a lot of users.
-For that reason pull-requests that change this limit will not be merged. Please clone or duplicate the space to work locally without limits.
+For that reason pull-requests that change this limit will not be merged. Please clone or duplicate the space to work locally without limits.
app.py CHANGED
@@ -73,7 +73,31 @@ def sample(ldm_stable, zs, wts, steps, prompt_tar, tstart, cfg_scale_tar): # ,
 
     return (16000, audio.squeeze().cpu().numpy())
 
-@spaces.GPU(duration=200)
+def get_duration(input_audio, model_id: str, do_inversion: bool,
+                 wts: Optional[torch.Tensor], zs: Optional[torch.Tensor],
+                 saved_inv_model: str, source_prompt="", target_prompt="",
+                 steps=200, cfg_scale_src=3.5, cfg_scale_tar=12, t_start=45, randomize_seed=True):
+    if model_id == LDM2:
+        factor = 0.8
+    elif model_id == LDM2_LARGE:
+        factor = 1.5
+    else:  # MUSIC
+        factor = 1
+
+    mult = 0
+    if do_inversion or randomize_seed:
+        mult = steps
+
+    if input_audio is None:
+        raise gr.Error('Input audio missing!')
+    duration = min(utils.get_duration(input_audio), 30)
+
+    time_per_iter_of_full = factor * ((t_start / 100 * steps) * 2 + mult) * 0.2
+    print('expected time:', time_per_iter_of_full / 30 * duration)
+    return time_per_iter_of_full / 30 * duration
+
+
+@spaces.GPU(duration=get_duration)
 def edit(
     # cache_dir,
     input_audio,
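
Note on the app.py change: `@spaces.GPU` now receives a callable instead of the fixed 200-second budget. On ZeroGPU Spaces, `duration` may be a function that is called with the same arguments as the decorated function and returns the number of seconds to reserve, which is why `get_duration` mirrors `edit`'s signature above. Worked example with the defaults: for the MUSIC model (factor = 1) with steps = 200, t_start = 45 and inversion enabled (mult = 200), a full 30-second clip reserves 1 * ((45/100 * 200) * 2 + 200) * 0.2 = 76 seconds, and shorter clips scale down linearly. A minimal standalone sketch of the pattern (the `process` function and its cost estimate are hypothetical, not this Space's code):

import spaces

def reserve_seconds(audio_len_s: float, steps: int = 200) -> float:
    # Hypothetical cost model: proportional to diffusion steps and clip
    # length, mirroring the shape of get_duration above.
    return steps * 2 * 0.2 * (audio_len_s / 30)

@spaces.GPU(duration=reserve_seconds)  # invoked with the same args as process()
def process(audio_len_s: float, steps: int = 200):
    ...  # GPU-bound work runs inside the reserved window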
requirements.txt CHANGED
@@ -1,4 +1,5 @@
 torch
+numpy<2
 torchaudio
 diffusers
 accelerate
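
Note on the requirements.txt change: the `numpy<2` pin is likely a guard against NumPy 2.0, whose ABI change makes extensions compiled against NumPy 1.x (common in older torch/torchaudio/audioldm builds) fail at import time. A quick sanity check, assuming the 1.x ABI really is what this Space needs:

import numpy
# Fail fast if the resolver installed NumPy 2.x despite the pin.
assert numpy.__version__.startswith("1."), numpy.__version__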
utils.py CHANGED
@@ -2,6 +2,7 @@ import numpy as np
 import torch
 from typing import Optional, List, Tuple, NamedTuple, Union
 from models import PipelineWrapper
+from audioldm.utils import get_duration
 
 
 class PromptEmbeddings(NamedTuple):
@@ -16,7 +17,7 @@ def load_audio(audio_path: Union[str, np.array], fn_STFT, left: int = 0, right:
     import audioldm
     import audioldm.audio
 
-    duration = min(audioldm.utils.get_duration(audio_path), 30)
+    duration = min(get_duration(audio_path), 30)
 
     mel, _, _ = audioldm.audio.wav_to_fbank(audio_path, target_length=int(duration * 102.4), fn_STFT=fn_STFT)
     mel = mel.unsqueeze(0)
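
Note on the utils.py change: importing `get_duration` at module level makes it reachable as `utils.get_duration`, which is what the new `get_duration` in app.py calls to measure the input clip. The helper just reports a file's length in seconds; an equivalent can be sketched with torchaudio (already a dependency), though this is an illustration rather than the audioldm implementation:

import torchaudio

def audio_duration_seconds(audio_path: str) -> float:
    # Reads container metadata only; does not decode the whole file.
    info = torchaudio.info(audio_path)
    return info.num_frames / info.sample_rate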