Khalida1w committed on
Commit
7f24723
1 Parent(s): fd4a352

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +135 -139
app.py CHANGED
@@ -9,44 +9,32 @@ from scipy.io import wavfile
9
  import gradio as gr
10
 
11
  def audio_to_audio_frame_stack(sound_data, frame_length, hop_length_frame):
12
- """This function take an audio and split into several frame
13
- in a numpy matrix of size (nb_frame,frame_length)"""
14
-
15
  sequence_sample_length = sound_data.shape[0]
16
-
17
- sound_data_list = [sound_data[start:start + frame_length] for start in range(
18
- 0, sequence_sample_length - frame_length + 1, hop_length_frame)] # get sliding windows
 
19
  sound_data_array = np.vstack(sound_data_list)
20
-
21
  return sound_data_array
22
 
23
-
24
  def audio_files_to_numpy(audio_dir, list_audio_files, sample_rate, frame_length, hop_length_frame, min_duration):
25
- """This function take audio files of a directory and merge them
26
- in a numpy matrix of size (nb_frame,frame_length) for a sliding window of size hop_length_frame"""
27
-
28
  list_sound_array = []
29
-
30
  for file in list_audio_files:
31
- # open the audio file
32
  y, sr = librosa.load(os.path.join(audio_dir, file), sr=sample_rate)
33
  total_duration = librosa.get_duration(y=y, sr=sr)
34
 
35
- if (total_duration >= min_duration):
36
- list_sound_array.append(audio_to_audio_frame_stack(
37
- y, frame_length, hop_length_frame))
38
  else:
39
- print(
40
- f"The following file {os.path.join(audio_dir,file)} is below the min duration")
41
-
42
- return np.vstack(list_sound_array)
43
-
44
 
45
  def blend_noise_randomly(voice, noise, nb_samples, frame_length):
46
- """This function takes as input numpy arrays representing frames
47
- of voice sounds, noise sounds and the number of frames to be created
48
- and return numpy arrays with voice randomly blend with noise"""
49
-
50
  prod_voice = np.zeros((nb_samples, frame_length))
51
  prod_noise = np.zeros((nb_samples, frame_length))
52
  prod_noisy_voice = np.zeros((nb_samples, frame_length))
@@ -61,188 +49,196 @@ def blend_noise_randomly(voice, noise, nb_samples, frame_length):
61
 
62
  return prod_voice, prod_noise, prod_noisy_voice
63
 
64
-
65
  def audio_to_magnitude_db_and_phase(n_fft, hop_length_fft, audio):
66
- """This function takes an audio and convert into spectrogram,
67
- it returns the magnitude in dB and the phase"""
68
-
69
  stftaudio = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length_fft)
70
  stftaudio_magnitude, stftaudio_phase = librosa.magphase(stftaudio)
71
-
72
- stftaudio_magnitude_db = librosa.amplitude_to_db(
73
- stftaudio_magnitude, ref=np.max)
74
-
75
  return stftaudio_magnitude_db, stftaudio_phase
76
 
77
-
78
  def numpy_audio_to_matrix_spectrogram(numpy_audio, dim_square_spec, n_fft, hop_length_fft):
79
- """This function takes as input a numpy audi of size (nb_frame,frame_length), and return
80
- a numpy containing the matrix spectrogram for amplitude in dB and phase. It will have the size
81
- (nb_frame,dim_square_spec,dim_square_spec)"""
82
-
83
  nb_audio = numpy_audio.shape[0]
84
-
85
  m_mag_db = np.zeros((nb_audio, dim_square_spec, dim_square_spec))
86
  m_phase = np.zeros((nb_audio, dim_square_spec, dim_square_spec), dtype=complex)
87
 
88
  for i in range(nb_audio):
89
  m_mag_db[i, :, :], m_phase[i, :, :] = audio_to_magnitude_db_and_phase(
90
  n_fft, hop_length_fft, numpy_audio[i])
91
-
92
  return m_mag_db, m_phase
93
 
94
-
95
  def magnitude_db_and_phase_to_audio(frame_length, hop_length_fft, stftaudio_magnitude_db, stftaudio_phase):
96
- """This functions reverts a spectrogram to an audio"""
97
-
98
  stftaudio_magnitude_rev = librosa.db_to_amplitude(stftaudio_magnitude_db, ref=1.0)
99
-
100
- # taking magnitude and phase of audio
101
  audio_reverse_stft = stftaudio_magnitude_rev * stftaudio_phase
102
- audio_reconstruct = librosa.core.istft(audio_reverse_stft, hop_length=hop_length_fft, length=frame_length)
103
-
104
  return audio_reconstruct
105
 
106
- def matrix_spectrogram_to_numpy_audio(m_mag_db, m_phase, frame_length, hop_length_fft) :
107
- """This functions reverts the matrix spectrograms to numpy audio"""
108
-
109
  list_audio = []
110
-
111
  nb_spec = m_mag_db.shape[0]
112
 
113
  for i in range(nb_spec):
114
-
115
- audio_reconstruct = magnitude_db_and_phase_to_audio(frame_length, hop_length_fft, m_mag_db[i], m_phase[i])
116
  list_audio.append(audio_reconstruct)
117
-
118
  return np.vstack(list_audio)
119
 
120
  def scaled_in(matrix_spec):
121
- "global scaling apply to noisy voice spectrograms (scale between -1 and 1)"
122
- matrix_spec = (matrix_spec + 46)/50
123
  return matrix_spec
124
 
125
  def scaled_ou(matrix_spec):
126
- "global scaling apply to noise models spectrograms (scale between -1 and 1)"
127
- matrix_spec = (matrix_spec -6 )/82
128
  return matrix_spec
129
 
130
  def inv_scaled_in(matrix_spec):
131
- "inverse global scaling apply to noisy voices spectrograms"
132
  matrix_spec = matrix_spec * 50 - 46
133
  return matrix_spec
134
 
135
  def inv_scaled_ou(matrix_spec):
136
- "inverse global scaling apply to noise models spectrograms"
137
  matrix_spec = matrix_spec * 82 + 6
138
  return matrix_spec
139
 
140
-
141
  def prediction(weights_path, name_model, audio_dir_prediction, dir_save_prediction, audio_input_prediction,
142
- audio_output_prediction, sample_rate, min_duration, frame_length, hop_length_frame, n_fft, hop_length_fft):
143
- """ This function takes as input pretrained weights, noisy voice sound to denoise, predict
144
- the denoise sound and save it to disk.
145
- """
146
-
147
- # load json and create model
148
- json_file = open(weights_path+'/'+name_model+'.json', 'r')
149
  loaded_model_json = json_file.read()
150
  json_file.close()
151
  loaded_model = model_from_json(loaded_model_json)
152
- # load weights into new model
153
- loaded_model.load_weights(weights_path+'/'+name_model+'.h5')
154
  print("Loaded model from disk")
155
 
156
- # Extracting noise and voice from folder and convert to numpy
157
- audio = audio_files_to_numpy(audio_dir_prediction, audio_input_prediction, sample_rate,
158
- frame_length, hop_length_frame, min_duration)
159
- # audio = audioData
160
- #Dimensions of squared spectrogram
161
- dim_square_spec = int(n_fft / 2) + 1
162
- print(dim_square_spec)
 
 
163
 
164
- # Create Amplitude and phase of the sounds
165
- m_amp_db_audio, m_pha_audio = numpy_audio_to_matrix_spectrogram(
166
- audio, dim_square_spec, n_fft, hop_length_fft)
167
 
168
- #global scaling to have distribution -1/1
 
 
 
 
169
  X_in = scaled_in(m_amp_db_audio)
170
- #Reshape for prediction
171
- X_in = X_in.reshape(X_in.shape[0],X_in.shape[1],X_in.shape[2],1)
172
- #Prediction using loaded network
 
173
  X_pred = loaded_model.predict(X_in)
174
- #Rescale back the noise model
175
  inv_sca_X_pred = inv_scaled_ou(X_pred)
176
- #Remove noise model from noisy speech
177
- X_denoise = m_amp_db_audio - inv_sca_X_pred[:,:,:,0]
178
- #Reconstruct audio from denoised spectrogram and phase
179
- print(X_denoise.shape)
180
- print(m_pha_audio.shape)
181
- print(frame_length)
182
- print(hop_length_fft)
183
  audio_denoise_recons = matrix_spectrogram_to_numpy_audio(X_denoise, m_pha_audio, frame_length, hop_length_fft)
184
- #Number of frames
 
185
  nb_samples = audio_denoise_recons.shape[0]
186
- #Save all frames in one file
187
- denoise_long = audio_denoise_recons.reshape(1, nb_samples * frame_length)*10
188
- # librosa.output.write_wav(dir_save_prediction + audio_output_prediction, denoise_long[0, :], sample_rate)
189
- print(audio_output_prediction)
190
- sf.write(audio_output_prediction , denoise_long[0, :], sample_rate)
191
-
192
- def denoise_audio(audioName):
193
-
194
- sr, data = audioName
195
- sf.write("temp.wav",data, sr)
196
- testNo = "temp"
197
- audio_dir_prediction = os.path.abspath("/")+ str(testNo) +".wav"
198
- sample_rate, data = audioName[0], audioName[1]
199
- len_data = len(data) # holds length of the numpy array
200
-
201
-
202
-
203
-
204
-
205
- t = len_data / sample_rate # returns duration but in floats
206
- print("t:",t)
 
 
 
 
 
 
 
 
 
 
 
 
207
  weights_path = os.path.abspath("./")
208
  name_model = "model_unet"
209
  audio_dir_prediction = os.path.abspath("./")
210
  dir_save_prediction = os.path.abspath("./")
211
  audio_output_prediction = "test.wav"
212
- audio_input_prediction = ["temp.wav"]
213
- sample_rate = 8000
214
  min_duration = t
215
  frame_length = 8064
216
  hop_length_frame = 8064
217
  n_fft = 255
218
  hop_length_fft = 63
219
 
220
- dim_square_spec = int(n_fft / 2) + 1
221
-
222
- prediction(weights_path, name_model, audio_dir_prediction, dir_save_prediction, audio_input_prediction,
223
- audio_output_prediction, sample_rate, min_duration, frame_length, hop_length_frame, n_fft, hop_length_fft)
224
- print(audio_output_prediction)
225
- return audio_output_prediction
226
-
227
-
 
 
 
 
 
 
 
 
 
228
  examples = [
229
  [os.path.abspath("crowdNoise.wav")],
230
  [os.path.abspath("CrowdNoise2.wav")],
231
  [os.path.abspath("whiteNoise.wav")]
232
  ]
233
 
234
-
235
-
236
- iface = gr.Interface(fn = denoise_audio,
237
- inputs = 'audio',
238
- outputs = 'audio',
239
- title = 'audio to denoised Audio Application',
240
- description = 'A simple application to denoise audio speech using UNet deep learning model. Upload your own audio, or click one of the examples to load them.',
241
- article =
242
- '''<div>
243
- <p style="text-align: center"> All you need to do is to upload the audio file and hit submit, then wait for compiling. After that click on Play/Pause for listing to the audio. The audio is saved in a wav format.</p>
244
- </div>''',
245
- examples=examples
246
- )
247
-
248
- iface.launch()
 
 
 
 
 
 
9
  import gradio as gr
10
 
11
def audio_to_audio_frame_stack(sound_data, frame_length, hop_length_frame):
    """Split an audio signal into fixed-length frames taken along its first axis.

    Args:
        sound_data: Numpy array of samples (assumed 1-D mono, as produced by
            ``librosa.load`` with default settings -- TODO confirm for other callers).
        frame_length: Number of samples in each frame.
        hop_length_frame: Step, in samples, between the starts of consecutive frames.

    Returns:
        Numpy array of shape (nb_frame, frame_length). Trailing samples that do
        not fill a whole frame are dropped. If the signal is shorter than one
        frame, an empty array of shape (0, frame_length) is returned.

    Raises:
        ValueError: If ``frame_length`` or ``hop_length_frame`` is not positive.
    """
    if frame_length <= 0 or hop_length_frame <= 0:
        raise ValueError("frame_length and hop_length_frame must be positive")

    sequence_sample_length = sound_data.shape[0]
    frame_starts = range(0, sequence_sample_length - frame_length + 1, hop_length_frame)

    # np.vstack refuses an empty sequence, so short signals need an explicit
    # empty result instead of an opaque "need at least one array" error.
    if not frame_starts:
        return np.empty((0, frame_length), dtype=sound_data.dtype)

    return np.vstack([sound_data[start:start + frame_length] for start in frame_starts])
21
 
 
22
def audio_files_to_numpy(audio_dir, list_audio_files, sample_rate, frame_length, hop_length_frame, min_duration):
    """Load audio files from a directory and stack their frames into one matrix.

    Args:
        audio_dir: Directory containing the audio files.
        list_audio_files: File names, relative to ``audio_dir``, to load.
        sample_rate: Sampling rate passed to ``librosa.load`` (files are resampled to it).
        frame_length: Number of samples in each frame.
        hop_length_frame: Step, in samples, between consecutive frame starts.
        min_duration: Minimum duration in seconds; shorter files are skipped.

    Returns:
        Numpy array of shape (nb_frame, frame_length) holding the frames of all
        accepted files, or an empty array (``size == 0``) when no file
        contributed a frame.
    """
    list_sound_array = []

    for file_name in list_audio_files:
        audio_path = os.path.join(audio_dir, file_name)
        y, sr = librosa.load(audio_path, sr=sample_rate)
        total_duration = librosa.get_duration(y=y, sr=sr)

        if total_duration < min_duration:
            print(f"The following file {audio_path} is below the min duration")
            continue

        # A clip can satisfy min_duration and still hold fewer samples than a
        # single frame; framing it would yield nothing to stack, so skip it.
        if y.shape[0] < frame_length:
            print(f"The following file {audio_path} is shorter than one frame ({frame_length} samples)")
            continue

        list_sound_array.append(audio_to_audio_frame_stack(y, frame_length, hop_length_frame))

    if not list_sound_array:
        return np.array([])
    return np.vstack(list_sound_array)
 
 
 
35
 
36
  def blend_noise_randomly(voice, noise, nb_samples, frame_length):
37
+ """This function randomly blends voice frames with noise frames."""
 
 
 
38
  prod_voice = np.zeros((nb_samples, frame_length))
39
  prod_noise = np.zeros((nb_samples, frame_length))
40
  prod_noisy_voice = np.zeros((nb_samples, frame_length))
 
49
 
50
  return prod_voice, prod_noise, prod_noisy_voice
51
 
 
52
def audio_to_magnitude_db_and_phase(n_fft, hop_length_fft, audio):
    """Convert an audio frame into a dB magnitude spectrogram and its phase.

    Args:
        n_fft: FFT window size passed to ``librosa.stft``.
        hop_length_fft: Hop length, in samples, passed to ``librosa.stft``.
        audio: Audio samples to transform.

    Returns:
        Tuple ``(magnitude_db, phase)``: the STFT magnitude converted to
        decibels, and the phase component returned by ``librosa.magphase``.
    """
    stftaudio = librosa.stft(audio, n_fft=n_fft, hop_length=hop_length_fft)
    stftaudio_magnitude, stftaudio_phase = librosa.magphase(stftaudio)

    # ref=np.max expresses every bin relative to this frame's own peak, so the
    # absolute level of the frame is not carried in the dB values.
    stftaudio_magnitude_db = librosa.amplitude_to_db(stftaudio_magnitude, ref=np.max)

    return stftaudio_magnitude_db, stftaudio_phase
58
 
 
59
def numpy_audio_to_matrix_spectrogram(numpy_audio, dim_square_spec, n_fft, hop_length_fft):
    """Compute dB magnitude and phase spectrograms for a stack of audio frames.

    Args:
        numpy_audio: Array of shape (nb_frame, frame_length), one frame per row.
        dim_square_spec: Side length of the square spectrogram stored per frame.
        n_fft: FFT window size used for the STFT.
        hop_length_fft: Hop length, in samples, used for the STFT.

    Returns:
        Tuple ``(m_mag_db, m_phase)``, each of shape
        (nb_frame, dim_square_spec, dim_square_spec); ``m_phase`` is complex.
    """
    nb_audio = numpy_audio.shape[0]
    spec_shape = (nb_audio, dim_square_spec, dim_square_spec)

    m_mag_db = np.zeros(spec_shape)
    m_phase = np.zeros(spec_shape, dtype=complex)

    # Each frame's STFT must come out exactly dim_square_spec x dim_square_spec
    # for these assignments to succeed; frame_length, n_fft and hop_length_fft
    # have to be chosen accordingly by the caller.
    for i, frame in enumerate(numpy_audio):
        m_mag_db[i, :, :], m_phase[i, :, :] = audio_to_magnitude_db_and_phase(
            n_fft, hop_length_fft, frame)

    return m_mag_db, m_phase
70
 
 
71
def magnitude_db_and_phase_to_audio(frame_length, hop_length_fft, stftaudio_magnitude_db, stftaudio_phase):
    """Rebuild a time-domain audio frame from a dB spectrogram and its phase.

    Args:
        frame_length: Number of samples the reconstructed frame must have.
        hop_length_fft: Hop length, in samples, used by the forward STFT.
        stftaudio_magnitude_db: Magnitude spectrogram in decibels.
        stftaudio_phase: Phase component matching the magnitude spectrogram.

    Returns:
        The audio frame produced by ``librosa.istft`` with ``length=frame_length``.
    """
    # dB -> linear amplitude against a reference of 1.0.
    stftaudio_magnitude_rev = librosa.db_to_amplitude(stftaudio_magnitude_db, ref=1.0)

    # Recombine magnitude and phase into a complex STFT before inverting it.
    audio_reverse_stft = stftaudio_magnitude_rev * stftaudio_phase
    audio_reconstruct = librosa.istft(audio_reverse_stft, hop_length=hop_length_fft, length=frame_length)

    return audio_reconstruct
77
 
78
def matrix_spectrogram_to_numpy_audio(m_mag_db, m_phase, frame_length, hop_length_fft):
    """Convert a stack of spectrograms back into a stack of audio frames.

    Args:
        m_mag_db: Array of dB magnitude spectrograms, one per frame along axis 0.
        m_phase: Array of matching phase spectrograms, indexed like ``m_mag_db``.
        frame_length: Number of samples in each reconstructed frame.
        hop_length_fft: Hop length, in samples, used by the forward STFT.

    Returns:
        Numpy array with one reconstructed frame per row. When ``m_mag_db``
        holds no spectrograms an empty array of shape (0, frame_length) is
        returned.
    """
    nb_spec = m_mag_db.shape[0]

    # np.vstack raises on an empty sequence; give callers a well-formed empty
    # result instead.
    if nb_spec == 0:
        return np.empty((0, frame_length))

    list_audio = [
        magnitude_db_and_phase_to_audio(frame_length, hop_length_fft, m_mag_db[i], m_phase[i])
        for i in range(nb_spec)
    ]
    return np.vstack(list_audio)
88
 
89
def scaled_in(matrix_spec, offset=46, scale=50):
    """Apply the global affine scaling used for noisy-voice spectrograms.

    Computes ``(matrix_spec + offset) / scale``. With the defaults, dB values
    from -96 to 4 map onto -1 to 1; values outside that span are not clipped.

    Args:
        matrix_spec: Scalar or numpy array of spectrogram values in dB.
        offset: Value added before dividing (default 46).
        scale: Divisor applied after the offset (default 50).

    Returns:
        The scaled value(s), same shape as ``matrix_spec``.
    """
    return (matrix_spec + offset) / scale
93
 
94
def scaled_ou(matrix_spec, offset=6, scale=82):
    """Apply the global affine scaling used for noise-model spectrograms.

    Computes ``(matrix_spec - offset) / scale``. With the defaults, values
    from -76 to 88 map onto -1 to 1; values outside that span are not clipped.

    Args:
        matrix_spec: Scalar or numpy array of spectrogram values.
        offset: Value subtracted before dividing (default 6).
        scale: Divisor applied after the offset (default 82).

    Returns:
        The scaled value(s), same shape as ``matrix_spec``.
    """
    return (matrix_spec - offset) / scale
98
 
99
def inv_scaled_in(matrix_spec, offset=46, scale=50):
    """Invert the global scaling applied to noisy-voice spectrograms.

    Computes ``matrix_spec * scale - offset``.

    Args:
        matrix_spec: Scalar or numpy array of scaled spectrogram values.
        offset: Value subtracted after multiplying (default 46).
        scale: Multiplier applied first (default 50).

    Returns:
        The unscaled value(s), same shape as ``matrix_spec``.
    """
    return matrix_spec * scale - offset
103
 
104
def inv_scaled_ou(matrix_spec, offset=6, scale=82):
    """Invert the global scaling applied to noise-model spectrograms.

    Computes ``matrix_spec * scale + offset``.

    Args:
        matrix_spec: Scalar or numpy array of scaled spectrogram values.
        offset: Value added after multiplying (default 6).
        scale: Multiplier applied first (default 82).

    Returns:
        The unscaled value(s), same shape as ``matrix_spec``.
    """
    return matrix_spec * scale + offset
108
 
 
109
def prediction(weights_path, name_model, audio_dir_prediction, dir_save_prediction, audio_input_prediction,
               audio_output_prediction, sample_rate, min_duration, frame_length, hop_length_frame, n_fft, hop_length_fft):
    """Denoise audio files with a pretrained model and write the result to disk.

    Args:
        weights_path: Directory holding ``<name_model>.json`` and ``<name_model>.h5``.
        name_model: Base name of the model architecture and weights files.
        audio_dir_prediction: Directory containing the input audio files.
        dir_save_prediction: Accepted for interface compatibility; not used here.
            The output goes to ``audio_output_prediction`` exactly as given.
        audio_input_prediction: List of input file names inside ``audio_dir_prediction``.
        audio_output_prediction: Path of the WAV file to write.
        sample_rate: Sampling rate used for loading and for the written file.
        min_duration: Minimum duration in seconds for an input file to be used.
        frame_length: Number of samples per frame.
        hop_length_frame: Step, in samples, between consecutive frames.
        n_fft: FFT window size used for the STFT.
        hop_length_fft: Hop length, in samples, used for the STFT.

    Returns:
        None. When no usable audio frames are found, nothing is written.
    """
    # Load the architecture (JSON) and weights (HDF5); the context manager
    # closes the file even if reading fails.
    with open(os.path.join(weights_path, name_model + '.json'), 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    loaded_model.load_weights(os.path.join(weights_path, name_model + '.h5'))
    print("Loaded model from disk")

    # Convert the audio file(s) to a matrix of frames.
    audio = audio_files_to_numpy(
        audio_dir_prediction,
        audio_input_prediction,
        sample_rate,
        frame_length,
        hop_length_frame,
        min_duration,
    )

    if audio.size == 0:
        print("No valid audio frames found, skipping prediction.")
        return

    # Side length of the square spectrogram (number of frequency bins).
    dim_square_spec = n_fft // 2 + 1

    # Amplitude (dB) and phase for every frame.
    m_amp_db_audio, m_pha_audio = numpy_audio_to_matrix_spectrogram(
        audio, dim_square_spec, n_fft, hop_length_fft)

    # Scale the input and add the trailing channel axis the network is fed with.
    X_in = scaled_in(m_amp_db_audio)
    X_in = X_in.reshape(X_in.shape[0], X_in.shape[1], X_in.shape[2], 1)

    # The network output is treated as a scaled noise spectrogram.
    X_pred = loaded_model.predict(X_in)
    inv_sca_X_pred = inv_scaled_ou(X_pred)

    # Remove the predicted noise from the noisy speech, in the dB domain.
    X_denoise = m_amp_db_audio - inv_sca_X_pred[:, :, :, 0]

    # Rebuild audio frames using the phase of the noisy input.
    audio_denoise_recons = matrix_spectrogram_to_numpy_audio(X_denoise, m_pha_audio, frame_length, hop_length_fft)

    # Concatenate all frames into one signal and apply the fixed x10 gain.
    # NOTE(review): the gain is hard-coded and may clip loud inputs -- confirm intent.
    nb_samples = audio_denoise_recons.shape[0]
    denoise_long = audio_denoise_recons.reshape(nb_samples * frame_length) * 10

    sf.write(audio_output_prediction, denoise_long, sample_rate)
    print(f"Saved denoised audio to: {audio_output_prediction}")
161
+
162
def denoise_audio(audio_input):
    """Gradio callback: denoise an uploaded or recorded audio clip.

    Args:
        audio_input: ``None``, a dict with ``"sample_rate"`` and ``"data"``
            keys, or a ``(sample_rate, data)`` tuple.

    Returns:
        Absolute path of the denoised WAV file, or ``None`` when no audio was
        provided, the clip is empty, or no output file was produced.
    """
    if audio_input is None:
        print("No audio was provided.")
        return None

    if isinstance(audio_input, dict):
        sr = audio_input["sample_rate"]
        data = audio_input["data"]
    else:
        sr, data = audio_input

    len_data = len(data)
    if len_data == 0:
        print("No audio was provided.")
        return None

    # The prediction pipeline reads its input from disk, so persist the clip.
    # NOTE(review): fixed file names are shared by concurrent requests.
    temp_wav = "temp.wav"
    sf.write(temp_wav, data, sr)

    t = len_data / sr  # duration in seconds
    print("t:", t)

    # Paths & config
    weights_path = os.path.abspath("./")
    name_model = "model_unet"
    audio_dir_prediction = os.path.abspath("./")
    dir_save_prediction = os.path.abspath("./")
    audio_output_prediction = "test.wav"
    audio_input_prediction = [temp_wav]
    sample_rate = 8000  # model was trained at 8k
    min_duration = t
    frame_length = 8064
    hop_length_frame = 8064
    n_fft = 255
    hop_length_fft = 63

    # Drop any result left by an earlier request: if prediction skips this
    # clip, a stale file must not be served as though it were the new output.
    output_path = os.path.abspath(audio_output_prediction)
    if os.path.exists(output_path):
        os.remove(output_path)

    prediction(weights_path, name_model,
               audio_dir_prediction,
               dir_save_prediction,
               audio_input_prediction,
               audio_output_prediction,
               sample_rate,
               min_duration,
               frame_length,
               hop_length_frame,
               n_fft,
               hop_length_fft)

    if not os.path.exists(output_path):
        print("No denoised audio was produced.")
        return None

    # Return the path to the denoised file so Gradio can play it.
    return output_path
217
+
218
# Sample files offered as one-click examples in the UI; resolved against the
# working directory at start-up.
examples = [
    [os.path.abspath("crowdNoise.wav")],
    [os.path.abspath("CrowdNoise2.wav")],
    [os.path.abspath("whiteNoise.wav")],
]

# Single audio-in / audio-out interface wired to the denoising callback.
iface = gr.Interface(
    fn=denoise_audio,
    inputs="audio",
    outputs="audio",
    title="Audio to Denoised Audio Application",
    description=(
        "A simple application to denoise audio speech using a UNet model. "
        "Upload your own audio or click one of the examples to load it."
    ),
    article="""
    <div style="text-align: center">
        <p>All you need to do is to upload or record an audio file and hit 'Submit'.
        After processing, you can click 'Play' to hear the denoised audio.
        The audio is saved in WAV format.</p>
    </div>
    """,
    examples=examples,
)

iface.launch()