Spaces:
Runtime error
Runtime error
File size: 4,307 Bytes
9c0c5c8 c17b696 9c0c5c8 d68360d 9c0c5c8 d68360d 9c0c5c8 c17b696 d68360d c17b696 c1e3d89 c17b696 d68360d 9c0c5c8 c1e3d89 d68360d ea68dfd 9c0c5c8 d68360d e66133f c17b696 e66133f c17b696 9d6c224 61d4556 9d6c224 ea68dfd d68360d 9c0c5c8 e66133f c17b696 ea68dfd 9c0c5c8 e66133f d68360d e66133f c17b696 9c0c5c8 e66133f c17b696 d68360d 9c0c5c8 d68360d 327bccf 9c0c5c8 e66133f c17b696 e66133f c17b696 d68360d 9c0c5c8 d68360d 9c0c5c8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
import warnings
warnings.filterwarnings("ignore")
import numpy as np # noqa: E402
import librosa # noqa: E402
from PIL import Image # noqa: E402
class Mel:
    """Convert audio to fixed-size mel spectrogram images and back.

    Audio is cut into consecutive slices of ``slice_size`` samples; each
    slice maps to one grayscale image whose pixel values encode decibels
    in the range ``[-top_db, 0]`` scaled to ``[0, 255]``.
    """

    def __init__(
        self,
        x_res: int = 256,
        y_res: int = 256,
        sample_rate: int = 22050,
        n_fft: int = 2048,
        hop_length: int = 512,
        top_db: int = 80,
        n_iter: int = 32,
    ):
        """Class to convert audio to mel spectrograms and vice versa.

        Args:
            x_res (int): x resolution of spectrogram (time)
            y_res (int): y resolution of spectrogram (frequency bins)
            sample_rate (int): sample rate of audio
            n_fft (int): length of the FFT window
            hop_length (int): hop length (a higher number is recommended for lower than 256 y_res)
            top_db (int): dynamic range in decibels mapped onto the 0-255 pixel range
            n_iter (int): number of iterations for Griffin-Lim mel inversion
        """
        self.hop_length = hop_length
        self.sr = sample_rate
        self.n_fft = n_fft
        self.top_db = top_db
        self.n_iter = n_iter
        # hop_length must be set before this call: slice_size is derived from it.
        self.set_resolution(x_res, y_res)
        self.audio = None

    def set_resolution(self, x_res: int, y_res: int):
        """Set resolution.

        Args:
            x_res (int): x resolution of spectrogram (time)
            y_res (int): y resolution of spectrogram (frequency bins)
        """
        self.x_res = x_res
        self.y_res = y_res
        self.n_mels = self.y_res
        # One sample short of x_res hops: with librosa's default centered
        # framing (1 + len // hop_length frames) this yields x_res frames.
        self.slice_size = self.x_res * self.hop_length - 1

    def load_audio(self, audio_file: str = None, raw_audio: np.ndarray = None):
        """Load audio.

        ``audio_file`` takes precedence when both arguments are given.

        Args:
            audio_file (str): must be a file on disk due to Librosa limitation or
            raw_audio (np.ndarray): audio as numpy array

        Raises:
            TypeError: if neither ``audio_file`` nor ``raw_audio`` is provided.
        """
        if audio_file is None and raw_audio is None:
            # Fail early with a clear message instead of `len(None)` below,
            # and without clobbering any previously loaded audio.
            raise TypeError("load_audio() requires either audio_file or raw_audio")

        if audio_file is not None:
            self.audio, _ = librosa.load(audio_file, mono=True, sr=self.sr)
        else:
            self.audio = raw_audio

        # Pad with silence if necessary so at least one full slice exists.
        # np.pad keeps the audio's dtype (concatenating with float64 zeros
        # would silently upcast float32 audio).
        min_length = self.x_res * self.hop_length
        missing = min_length - len(self.audio)
        if missing > 0:
            self.audio = np.pad(self.audio, (0, missing))

    def get_number_of_slices(self) -> int:
        """Get number of slices in audio.

        Audio must have been loaded with ``load_audio`` first.

        Returns:
            int: number of spectrograms audio can be sliced into
        """
        return len(self.audio) // self.slice_size

    def get_audio_slice(self, slice: int = 0) -> np.ndarray:
        """Get slice of audio.

        Args:
            slice (int): slice number of audio (out of get_number_of_slices())

        Returns:
            np.ndarray: audio as numpy array
        """
        # NOTE: the parameter name shadows the builtin `slice`; it is kept
        # because callers may pass it by keyword.
        start = self.slice_size * slice
        return self.audio[start : start + self.slice_size]

    def get_sample_rate(self) -> int:
        """Get sample rate.

        Returns:
            int: sample rate of audio
        """
        return self.sr

    def audio_slice_to_image(self, slice: int) -> "Image.Image":
        """Convert slice of audio to spectrogram.

        Args:
            slice (int): slice number of audio to convert (out of get_number_of_slices())

        Returns:
            PIL Image: grayscale image of x_res x y_res
        """
        S = librosa.feature.melspectrogram(
            y=self.get_audio_slice(slice),
            sr=self.sr,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            n_mels=self.n_mels,
        )
        # Decibels relative to the loudest bin, floored at -top_db.
        log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db)
        # Map [-top_db, 0] dB onto [0, 255]; the +0.5 rounds to nearest
        # before the truncating cast to uint8.
        scaled = (log_S + self.top_db) * 255 / self.top_db
        bytedata = (scaled.clip(0, 255) + 0.5).astype(np.uint8)
        return Image.fromarray(bytedata)

    def image_to_audio(self, image: "Image.Image") -> np.ndarray:
        """Converts spectrogram to audio.

        Args:
            image (PIL Image): x_res x y_res grayscale image

        Returns:
            audio (np.ndarray): raw audio
        """
        # One byte per pixel is assumed, i.e. a single-channel 8-bit image.
        bytedata = np.frombuffer(image.tobytes(), dtype="uint8").reshape((image.height, image.width))
        # Inverse of the pixel mapping in audio_slice_to_image: [0, 255] -> [-top_db, 0] dB.
        log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db
        S = librosa.db_to_power(log_S)
        audio = librosa.feature.inverse.mel_to_audio(
            S, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_iter=self.n_iter
        )
        return audio
|