fffiloni committed on
Commit 5dfe08d
1 Parent(s): e4b2d25

Update app.py

Files changed (1)
  1. app.py +84 -41
app.py CHANGED
@@ -1,6 +1,6 @@
 import gradio as gr
+
 import io
-import typing as T
 
 import numpy as np
 from PIL import Image
@@ -8,36 +8,56 @@ import pydub
 from scipy.io import wavfile
 import torch
 import torchaudio
+import argparse
 
-def convert(audio):
-    # read uploaded file to wav
-    rate, data = wavfile.read(audio)
-
-    # resample from 48000 to 44100
-    from scipy.signal import resample
-    data = resample(data, int(data.shape[0] * 44100 / 48000))
-
-    # convert to mono
-    data = np.mean(data, axis=0)
-
-    # convert to float32
-    data = data.astype(np.float32)
-
-    # take a random 7 second slice of the audio
-    data = data[rate*7:rate*14]
-
-    spectrogram = spectrogram_from_waveform(
-        waveform=data,
-        sample_rate=rate,
-        # width=768,
-        n_fft=8192,
-        hop_length=512,
-        win_length=8192,
+parser = argparse.ArgumentParser()
+parser.add_argument("-i", "--input", help="Input file to process, anything that FFMPEG supports, but wav and mp3 are recommended")
+parser.add_argument("-o", "--output", help="Output Image")
+parser.add_argument("-m", "--maxvol", default=100, help="Max Volume, 255 for identical results")
+parser.add_argument("-p", "--powerforimage", default=0.25, help="Power for Image")
+parser.add_argument("-n", "--nmels", default=512, help="n_mels to use for Image, basically width. Higher = more fidelity")
+args = parser.parse_args()
+
+def spectrogram_image_from_wav(wav_bytes: io.BytesIO, max_volume: float = 50, power_for_image: float = 0.25, ms_duration: int = 5119) -> Image.Image:
+    """
+    Generate a spectrogram image from a WAV file.
+    """
+    # Read WAV file from bytes
+    sample_rate, waveform = wavfile.read(wav_bytes)
+
+    #sample_rate = 44100 # [Hz]
+    clip_duration_ms = ms_duration # [ms]
+
+    bins_per_image = 512
+    n_mels = int(args.nmels)
+    mel_scale = True
+
+    # FFT parameters
+    window_duration_ms = 100 # [ms]
+    padded_duration_ms = 400 # [ms]
+    step_size_ms = 10 # [ms]
+
+    # Derived parameters
+    num_samples = int(512 / float(bins_per_image) * clip_duration_ms) * sample_rate
+    n_fft = int(padded_duration_ms / 1000.0 * sample_rate)
+    hop_length = int(step_size_ms / 1000.0 * sample_rate)
+    win_length = int(window_duration_ms / 1000.0 * sample_rate)
+
+    # Compute spectrogram from waveform
+    Sxx = spectrogram_from_waveform(
+        waveform=waveform,
+        sample_rate=sample_rate,
+        n_fft=n_fft,
+        hop_length=hop_length,
+        win_length=win_length,
+        mel_scale=mel_scale,
+        n_mels=n_mels,
     )
 
-    spec = image_from_spectrogram(spectrogram)
+    # Convert spectrogram to image
+    image = image_from_spectrogram(Sxx, max_volume=max_volume, power_for_image=power_for_image)
 
-    return spec
+    return image
 
 def spectrogram_from_waveform(
     waveform: np.ndarray,
@@ -80,28 +100,51 @@ def spectrogram_from_waveform
     return Sxx_mag
 
 def image_from_spectrogram(
-    spectrogram: np.ndarray, max_volume: float = 50, power_for_image: float = 0.25
+    data: np.ndarray,
+    max_volume: float = 50,
+    power_for_image: float = 0.25
 ) -> Image.Image:
+    data = np.power(data, power_for_image)
+    data = data / (max_volume / 255)
+    data = 255 - data
+    data = data[::-1]
+    image = Image.fromarray(data.astype(np.uint8))
+    return image
+
+def spectrogram_image_from_file(filename, max_volume: float = 50, power_for_image: float = 0.25) -> Image.Image:
     """
-    Compute a spectrogram image from a spectrogram magnitude array.
+    Generate a spectrogram image from an MP3 file.
     """
-    # Apply the power curve
-    data = np.power(spectrogram, power_for_image)
 
-    # Rescale to 0-255
-    data = data * 255 / max_volume
+    max_volume = int(args.maxvol)
+    power_for_image = float(args.powerforimage)
 
-    # Invert
-    data = 255 - data
+    # Load MP3 file into AudioSegment object
+    audio = pydub.AudioSegment.from_file(filename)
 
-    # Convert to a PIL image
-    image = Image.fromarray(data.astype(np.uint8))
+    # Convert to mono and set frame rate
+    audio = audio.set_channels(1)
+    audio = audio.set_frame_rate(44100)
+
+    length_in_ms = len(audio)
+    print("ORIGINAL AUDIO LENGTH IN MS:", length_in_ms)
+    # Extract first 5 seconds of audio data
+    audio = audio[:5119]
+    length_in_ms = len(audio)
+    print("CROPPED AUDIO LENGTH IN MS:", length_in_ms)
 
-    # Flip Y
-    image = image.transpose(Image.FLIP_TOP_BOTTOM)
+    # Convert to WAV and save as BytesIO object
+    wav_bytes = io.BytesIO()
+    audio.export("clip.wav", format="wav")
+    audio.export(wav_bytes, format="wav")
+    wav_bytes.seek(0)
 
-    # Convert to RGB
-    image = image.convert("RGB")
+    # Generate spectrogram image from WAV file
+    return spectrogram_image_from_wav(wav_bytes, max_volume=max_volume, power_for_image=power_for_image, ms_duration=length_in_ms)
+
+def convert(audio):
+
+    image = spectrogram_image_from_file(filename)
 
     return image
 
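
The new argparse flags (-i/--input, -o/--output, -m/--maxvol, -p/--powerforimage, -n/--nmels) are parsed at module import, but the lines shown here never read args.input or args.output. A minimal sketch of the standalone flow those two flags suggest, assuming app.py is also meant to run from the command line (this __main__ block is not part of the commit):

if __name__ == "__main__":
    # Hypothetical CLI entry point, e.g.:  python app.py -i clip.mp3 -o spectrogram.png -m 255 -n 512
    # Render the spectrogram of the file named by -i and save it to the path named by -o.
    out_image = spectrogram_image_from_file(args.input)
    out_image.save(args.output)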
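
With the 44100 Hz frame rate that spectrogram_image_from_file sets before exporting the clip, the derived STFT parameters inside spectrogram_image_from_wav reduce to fixed values; the arithmetic from the formulas above:

sample_rate = 44100                           # [Hz], set via audio.set_frame_rate(44100)
n_fft = int(400 / 1000.0 * sample_rate)       # 400 ms padded window -> 17640 samples
win_length = int(100 / 1000.0 * sample_rate)  # 100 ms analysis window -> 4410 samples
hop_length = int(10 / 1000.0 * sample_rate)   # 10 ms step -> 441 samples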
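
The rewritten image_from_spectrogram does the whole pixel mapping in NumPy: a power curve compresses the magnitudes, max_volume is scaled to 255, the values are inverted so louder bins become darker pixels, and data[::-1] reverses the rows, reproducing the old FLIP_TOP_BOTTOM transpose. Note that the convert("RGB") step is gone, so the result is now a single-channel image. A small worked example of the mapping with the new --maxvol default of 100:

import numpy as np
max_volume, power_for_image = 100, 0.25
Sxx = np.array([[0.0, 1.0, 100.0]])    # example magnitudes: silence, quiet, loud
data = np.power(Sxx, power_for_image)  # [0.0, 1.0, ~3.16]
data = data / (max_volume / 255)       # [0.0, 2.55, ~8.06]
data = 255 - data                      # [255.0, 252.45, ~246.94] -- louder bins are darker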
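
As committed, convert() passes filename to spectrogram_image_from_file, but filename is not defined in that scope, so the Gradio callback would raise a NameError; the uploaded file path (the audio argument) is presumably what was intended. A minimal sketch of that wiring, assuming a filepath-typed Audio input (the gr.Interface definition is outside the lines shown in this diff):

def convert(audio):
    # `audio` is the uploaded clip's path when the input component is gr.Audio(type="filepath")
    return spectrogram_image_from_file(audio)

demo = gr.Interface(fn=convert, inputs=gr.Audio(type="filepath"), outputs="image")
demo.launch()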