Spaces:
Runtime error
Runtime error
fix imports
Browse files
audiodiffusion/__init__.py
CHANGED
@@ -9,7 +9,7 @@ from tqdm.auto import tqdm
|
|
9 |
# from diffusers import AudioDiffusionPipeline
|
10 |
from .pipeline_audio_diffusion import AudioDiffusionPipeline
|
11 |
|
12 |
-
VERSION = "1.4.0"
|
13 |
|
14 |
|
15 |
class AudioDiffusion:
|
|
|
9 |
# from diffusers import AudioDiffusionPipeline
|
10 |
from .pipeline_audio_diffusion import AudioDiffusionPipeline
|
11 |
|
12 |
+
VERSION = "1.4.1"
|
13 |
|
14 |
|
15 |
class AudioDiffusion:
|
audiodiffusion/mel.py
CHANGED
@@ -23,8 +23,21 @@ from diffusers.schedulers.scheduling_utils import SchedulerMixin
|
|
23 |
|
24 |
warnings.filterwarnings("ignore")
|
25 |
|
26 |
-
import librosa # noqa: E402
|
27 |
import numpy as np # noqa: E402
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
from PIL import Image # noqa: E402
|
29 |
|
30 |
|
@@ -61,6 +74,9 @@ class Mel(ConfigMixin, SchedulerMixin):
|
|
61 |
self.set_resolution(x_res, y_res)
|
62 |
self.audio = None
|
63 |
|
|
|
|
|
|
|
64 |
def set_resolution(self, x_res: int, y_res: int):
|
65 |
"""Set resolution.
|
66 |
|
@@ -87,12 +103,7 @@ class Mel(ConfigMixin, SchedulerMixin):
|
|
87 |
|
88 |
# Pad with silence if necessary.
|
89 |
if len(self.audio) < self.x_res * self.hop_length:
|
90 |
-
self.audio = np.concatenate(
|
91 |
-
[
|
92 |
-
self.audio,
|
93 |
-
np.zeros((self.x_res * self.hop_length - len(self.audio),)),
|
94 |
-
]
|
95 |
-
)
|
96 |
|
97 |
def get_number_of_slices(self) -> int:
|
98 |
"""Get number of slices in audio.
|
@@ -131,11 +142,7 @@ class Mel(ConfigMixin, SchedulerMixin):
|
|
131 |
`PIL Image`: grayscale image of x_res x y_res
|
132 |
"""
|
133 |
S = librosa.feature.melspectrogram(
|
134 |
-
y=self.get_audio_slice(slice),
|
135 |
-
sr=self.sr,
|
136 |
-
n_fft=self.n_fft,
|
137 |
-
hop_length=self.hop_length,
|
138 |
-
n_mels=self.n_mels,
|
139 |
)
|
140 |
log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db)
|
141 |
bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) + 0.5).astype(np.uint8)
|
@@ -155,10 +162,6 @@ class Mel(ConfigMixin, SchedulerMixin):
|
|
155 |
log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db
|
156 |
S = librosa.db_to_power(log_S)
|
157 |
audio = librosa.feature.inverse.mel_to_audio(
|
158 |
-
S,
|
159 |
-
sr=self.sr,
|
160 |
-
n_fft=self.n_fft,
|
161 |
-
hop_length=self.hop_length,
|
162 |
-
n_iter=self.n_iter,
|
163 |
)
|
164 |
return audio
|
|
|
23 |
|
24 |
warnings.filterwarnings("ignore")
|
25 |
|
|
|
26 |
import numpy as np # noqa: E402
|
27 |
+
|
28 |
+
|
29 |
+
try:
|
30 |
+
import librosa # noqa: E402
|
31 |
+
|
32 |
+
_librosa_can_be_imported = True
|
33 |
+
_import_error = ""
|
34 |
+
except Exception as e:
|
35 |
+
_librosa_can_be_imported = False
|
36 |
+
_import_error = (
|
37 |
+
f"Cannot import librosa because {e}. Make sure to correctly install librosa to be able to install it."
|
38 |
+
)
|
39 |
+
|
40 |
+
|
41 |
from PIL import Image # noqa: E402
|
42 |
|
43 |
|
|
|
74 |
self.set_resolution(x_res, y_res)
|
75 |
self.audio = None
|
76 |
|
77 |
+
if not _librosa_can_be_imported:
|
78 |
+
raise ValueError(_import_error)
|
79 |
+
|
80 |
def set_resolution(self, x_res: int, y_res: int):
|
81 |
"""Set resolution.
|
82 |
|
|
|
103 |
|
104 |
# Pad with silence if necessary.
|
105 |
if len(self.audio) < self.x_res * self.hop_length:
|
106 |
+
self.audio = np.concatenate([self.audio, np.zeros((self.x_res * self.hop_length - len(self.audio),))])
|
|
|
|
|
|
|
|
|
|
|
107 |
|
108 |
def get_number_of_slices(self) -> int:
|
109 |
"""Get number of slices in audio.
|
|
|
142 |
`PIL Image`: grayscale image of x_res x y_res
|
143 |
"""
|
144 |
S = librosa.feature.melspectrogram(
|
145 |
+
y=self.get_audio_slice(slice), sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_mels=self.n_mels
|
|
|
|
|
|
|
|
|
146 |
)
|
147 |
log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db)
|
148 |
bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) + 0.5).astype(np.uint8)
|
|
|
162 |
log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db
|
163 |
S = librosa.db_to_power(log_S)
|
164 |
audio = librosa.feature.inverse.mel_to_audio(
|
165 |
+
S, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_iter=self.n_iter
|
|
|
|
|
|
|
|
|
166 |
)
|
167 |
return audio
|
audiodiffusion/pipeline_audio_diffusion.py
CHANGED
@@ -21,13 +21,12 @@ from typing import List, Tuple, Union
|
|
21 |
|
22 |
import numpy as np
|
23 |
import torch
|
24 |
-
from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler,
|
25 |
from diffusers.pipeline_utils import AudioPipelineOutput, BaseOutput, DiffusionPipeline, ImagePipelineOutput
|
26 |
from PIL import Image
|
27 |
|
28 |
from .mel import Mel
|
29 |
|
30 |
-
|
31 |
class AudioDiffusionPipeline(DiffusionPipeline):
|
32 |
"""
|
33 |
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
|
|
|
21 |
|
22 |
import numpy as np
|
23 |
import torch
|
24 |
+
from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, UNet2DConditionModel
|
25 |
from diffusers.pipeline_utils import AudioPipelineOutput, BaseOutput, DiffusionPipeline, ImagePipelineOutput
|
26 |
from PIL import Image
|
27 |
|
28 |
from .mel import Mel
|
29 |
|
|
|
30 |
class AudioDiffusionPipeline(DiffusionPipeline):
|
31 |
"""
|
32 |
This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
|