File size: 4,307 Bytes
9c0c5c8
c17b696
9c0c5c8
d68360d
9c0c5c8
d68360d
 
 
 
9c0c5c8
c17b696
d68360d
 
 
 
 
 
 
 
 
 
 
c17b696
 
 
 
 
 
 
 
 
c1e3d89
c17b696
d68360d
9c0c5c8
 
 
c1e3d89
d68360d
ea68dfd
9c0c5c8
d68360d
 
 
 
 
 
 
 
 
 
 
 
e66133f
c17b696
 
 
e66133f
 
c17b696
9d6c224
61d4556
9d6c224
ea68dfd
 
 
 
d68360d
9c0c5c8
e66133f
c17b696
 
 
 
 
ea68dfd
9c0c5c8
e66133f
 
 
 
 
 
 
 
 
d68360d
e66133f
 
c17b696
 
 
 
 
9c0c5c8
 
e66133f
c17b696
 
 
 
 
 
 
 
d68360d
 
 
9c0c5c8
d68360d
327bccf
9c0c5c8
 
e66133f
c17b696
 
 
 
 
 
e66133f
c17b696
d68360d
9c0c5c8
 
 
d68360d
 
9c0c5c8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import warnings


# Silence every warning category for the whole process. The filter is
# installed before the third-party imports below (which is why those imports
# carry `noqa: E402`) -- presumably so that import-time warnings are hidden
# as well; TODO confirm this is still required.
warnings.filterwarnings("ignore")

import numpy as np  # noqa: E402

import librosa  # noqa: E402
from PIL import Image  # noqa: E402


class Mel:
    """Convert audio to mel-spectrogram images and back.

    Audio is cut into fixed-size slices; each slice is rendered as a
    grayscale image whose width is ``x_res`` (time frames) and whose height
    is ``y_res`` (mel bins). ``image_to_audio`` performs the approximate
    inverse transformation.
    """

    def __init__(
        self,
        x_res: int = 256,
        y_res: int = 256,
        sample_rate: int = 22050,
        n_fft: int = 2048,
        hop_length: int = 512,
        top_db: int = 80,
        n_iter: int = 32,
    ):
        """Configure the audio <-> mel spectrogram conversion.

        Args:
            x_res (int): x resolution of spectrogram (time)
            y_res (int): y resolution of spectrogram (frequency bins)
            sample_rate (int): sample rate of audio
            n_fft (int): length of the FFT window
            hop_length (int): number of samples between successive frames
                (a higher number is recommended for lower than 256 y_res)
            top_db (int): dynamic range in decibels; levels more than top_db
                below the peak are clipped
            n_iter (int): number of iterations for Griffin-Lim mel inversion
        """
        # hop_length must be stored before set_resolution(), which derives
        # slice_size from it.
        self.hop_length = hop_length
        self.sr = sample_rate
        self.n_fft = n_fft
        self.top_db = top_db
        self.n_iter = n_iter
        self.set_resolution(x_res, y_res)
        self.audio = None

    def set_resolution(self, x_res: int, y_res: int):
        """Set the spectrogram resolution and the derived slice size.

        Args:
            x_res (int): x resolution of spectrogram (time)
            y_res (int): y resolution of spectrogram (frequency bins)
        """
        self.x_res = x_res
        self.y_res = y_res
        self.n_mels = self.y_res
        # One sample short of x_res hops: with librosa's default centered
        # framing a signal of n samples yields 1 + n // hop_length frames,
        # so this length produces exactly x_res frames per slice.
        self.slice_size = self.x_res * self.hop_length - 1

    def load_audio(self, audio_file: str = None, raw_audio: np.ndarray = None):
        """Load audio from a file or from an in-memory array.

        If both arguments are given, ``audio_file`` takes precedence.

        Args:
            audio_file (str): must be a file on disk due to Librosa limitation or
            raw_audio (np.ndarray): audio as numpy array

        Raises:
            ValueError: if neither audio_file nor raw_audio is provided
        """
        if audio_file is not None:
            self.audio, _ = librosa.load(audio_file, mono=True, sr=self.sr)
        elif raw_audio is not None:
            self.audio = raw_audio
        else:
            # Previously this path stored None and then failed on len(None).
            raise ValueError("Either audio_file or raw_audio must be provided.")

        # Pad with silence if necessary so that at least one slice exists.
        shortfall = self.x_res * self.hop_length - len(self.audio)
        if shortfall > 0:
            self.audio = np.concatenate([self.audio, np.zeros((shortfall,))])

    def get_number_of_slices(self) -> int:
        """Get number of slices in audio.

        Returns:
            int: number of spectrograms audio can be sliced into
        """
        return len(self.audio) // self.slice_size

    def get_audio_slice(self, slice: int = 0) -> np.ndarray:
        """Get slice of audio.

        Args:
            slice (int): slice number of audio (out of get_number_of_slices())

        Returns:
            np.ndarray: audio as numpy array
        """
        start = self.slice_size * slice
        return self.audio[start : start + self.slice_size]

    def get_sample_rate(self) -> int:
        """Get sample rate.

        Returns:
            int: sample rate of audio
        """
        return self.sr

    def audio_slice_to_image(self, slice: int) -> Image.Image:
        """Convert slice of audio to spectrogram.

        Args:
            slice (int): slice number of audio to convert (out of get_number_of_slices())

        Returns:
            PIL Image: grayscale image of x_res x y_res
        """
        S = librosa.feature.melspectrogram(
            y=self.get_audio_slice(slice), sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_mels=self.n_mels
        )
        # Decibels relative to the slice's own peak, limited to top_db below it.
        log_S = librosa.power_to_db(S, ref=np.max, top_db=self.top_db)
        # Map [-top_db, 0] dB linearly onto [0, 255]; the + 0.5 rounds to the
        # nearest integer before astype() truncates.
        bytedata = (((log_S + self.top_db) * 255 / self.top_db).clip(0, 255) + 0.5).astype(np.uint8)
        return Image.fromarray(bytedata)

    def image_to_audio(self, image: Image.Image) -> np.ndarray:
        """Convert spectrogram to audio.

        The reshape below only succeeds when the image stores exactly one
        byte per pixel (as the images from audio_slice_to_image do).

        Args:
            image (PIL Image): x_res x y_res grayscale image

        Returns:
            audio (np.ndarray): raw audio
        """
        bytedata = np.frombuffer(image.tobytes(), dtype="uint8").reshape((image.height, image.width))
        # Inverse of the byte mapping in audio_slice_to_image:
        # [0, 255] -> [-top_db, 0] dB.
        log_S = bytedata.astype("float") * self.top_db / 255 - self.top_db
        S = librosa.db_to_power(log_S)
        # Phase is not stored in the image, so it is estimated iteratively
        # (n_iter iterations).
        return librosa.feature.inverse.mel_to_audio(
            S, sr=self.sr, n_fft=self.n_fft, hop_length=self.hop_length, n_iter=self.n_iter
        )