Hugging Face Spaces (Space status: Runtime error)
Commit: "fix steps for DDIM" — Browse files
Changed file: audiodiffusion/__init__.py (CHANGED)
@@ -60,7 +60,7 @@ class AudioDiffusion:
|
|
60 |
|
61 |
def generate_spectrogram_and_audio(
|
62 |
self,
|
63 |
-
steps: int = [old default value missing from the captured diff — lost in the change-highlight span]
|
64 |
generator: torch.Generator = None
|
65 |
) -> Tuple[Image.Image, Tuple[int, np.ndarray]]:
|
66 |
"""Generate random mel spectrogram and convert to audio.
|
@@ -85,7 +85,7 @@ class AudioDiffusion:
|
|
85 |
raw_audio: np.ndarray = None,
|
86 |
slice: int = 0,
|
87 |
start_step: int = 0,
|
88 |
-
steps: int = [old default value missing from the captured diff — lost in the change-highlight span]
|
89 |
generator: torch.Generator = None,
|
90 |
mask_start_secs: float = 0,
|
91 |
mask_end_secs: float = 0
|
@@ -157,7 +157,7 @@ class AudioDiffusionPipeline(DiffusionPipeline):
|
|
157 |
raw_audio: np.ndarray = None,
|
158 |
slice: int = 0,
|
159 |
start_step: int = 0,
|
160 |
-
steps: int = [old default value missing from the captured diff — lost in the change-highlight span]
|
161 |
generator: torch.Generator = None,
|
162 |
mask_start_secs: float = 0,
|
163 |
mask_end_secs: float = 0
|
@@ -181,8 +181,7 @@ class AudioDiffusionPipeline(DiffusionPipeline):
|
|
181 |
(float, List[np.ndarray]): sample rate and raw audios
|
182 |
"""
|
183 |
|
184 |
-
|
185 |
-
self.scheduler.set_timesteps(steps)
|
186 |
mask = None
|
187 |
images = noise = torch.randn(
|
188 |
(batch_size, self.unet.in_channels, mel.y_res, mel.x_res),
|
@@ -206,9 +205,7 @@ class AudioDiffusionPipeline(DiffusionPipeline):
|
|
206 |
if start_step > 0:
|
207 |
images[0, 0] = self.scheduler.add_noise(
|
208 |
torch.tensor(input_images[:, np.newaxis, np.newaxis, :]),
|
209 |
-
noise,
|
210 |
-
torch.tensor(self.scheduler.num_train_timesteps -
|
211 |
-
start_step))
|
212 |
|
213 |
pixels_per_second = (mel.get_sample_rate() / mel.hop_length)
|
214 |
mask_start = int(mask_start_secs * pixels_per_second)
|
|
|
60 |
|
61 |
def generate_spectrogram_and_audio(
|
62 |
self,
|
63 |
+
steps: int = 1000,
|
64 |
generator: torch.Generator = None
|
65 |
) -> Tuple[Image.Image, Tuple[int, np.ndarray]]:
|
66 |
"""Generate random mel spectrogram and convert to audio.
|
|
|
85 |
raw_audio: np.ndarray = None,
|
86 |
slice: int = 0,
|
87 |
start_step: int = 0,
|
88 |
+
steps: int = 1000,
|
89 |
generator: torch.Generator = None,
|
90 |
mask_start_secs: float = 0,
|
91 |
mask_end_secs: float = 0
|
|
|
157 |
raw_audio: np.ndarray = None,
|
158 |
slice: int = 0,
|
159 |
start_step: int = 0,
|
160 |
+
steps: int = 1000,
|
161 |
generator: torch.Generator = None,
|
162 |
mask_start_secs: float = 0,
|
163 |
mask_end_secs: float = 0
|
|
|
181 |
(float, List[np.ndarray]): sample rate and raw audios
|
182 |
"""
|
183 |
|
184 |
+
self.scheduler.set_timesteps(steps)
|
|
|
185 |
mask = None
|
186 |
images = noise = torch.randn(
|
187 |
(batch_size, self.unet.in_channels, mel.y_res, mel.x_res),
|
|
|
205 |
if start_step > 0:
|
206 |
images[0, 0] = self.scheduler.add_noise(
|
207 |
torch.tensor(input_images[:, np.newaxis, np.newaxis, :]),
|
208 |
+
noise, torch.tensor(steps - start_step))
|
|
|
|
|
209 |
|
210 |
pixels_per_second = (mel.get_sample_rate() / mel.hop_length)
|
211 |
mask_start = int(mask_start_secs * pixels_per_second)
|