Hendrik Schroeter committed on
Commit
4f235e3
1 Parent(s): 897b496

Force mono audio with max len of 10s

Browse files
Files changed (2) hide show
  1. app.py +8 -1
  2. usage.md +1 -0
app.py CHANGED
@@ -41,7 +41,7 @@ def mix_at_snr(clean, noise, snr, eps=1e-10):
41
  if noise.shape[1] < clean.shape[1]:
42
  noise = noise.repeat((1, int(math.ceil(clean.shape[1] / noise.shape[1]))))
43
  max_start = int(noise.shape[1] - clean.shape[1])
44
- start = torch.randint(0, max_start, ()).item()
45
  logger.debug(f"start: {start}, {clean.shape}")
46
  noise = noise[:, start : start + clean.shape[1]]
47
  E_speech = torch.mean(clean.pow(2)) + eps
@@ -92,6 +92,7 @@ def mix_and_denoise(
92
  if noise_fn is None:
93
  noise_fn = "samples/dkitchen.wav"
94
  meta = AudioMetaData(-1, -1, -1, -1, "")
 
95
  if speech_rec is None and speech_upl is None:
96
  speech, meta = load_audio("samples/p232_013_clean.wav", sr)
97
  elif speech_upl is not None:
@@ -100,6 +101,12 @@ def mix_and_denoise(
100
  tmp = load_audio_gradio(speech_rec, sr)
101
  assert tmp is not None
102
  speech, meta = tmp
 
 
 
 
 
 
103
  logger.info(f"Loaded speech with shape {speech.shape}")
104
  noise, _ = load_audio(noise_fn, sr) # type: ignore
105
  if meta.sample_rate != sr:
 
41
  if noise.shape[1] < clean.shape[1]:
42
  noise = noise.repeat((1, int(math.ceil(clean.shape[1] / noise.shape[1]))))
43
  max_start = int(noise.shape[1] - clean.shape[1])
44
+ start = torch.randint(0, max_start, ()).item() if max_start > 0 else 0
45
  logger.debug(f"start: {start}, {clean.shape}")
46
  noise = noise[:, start : start + clean.shape[1]]
47
  E_speech = torch.mean(clean.pow(2)) + eps
 
92
  if noise_fn is None:
93
  noise_fn = "samples/dkitchen.wav"
94
  meta = AudioMetaData(-1, -1, -1, -1, "")
95
+ max_s = 10 # limit to 10 seconds
96
  if speech_rec is None and speech_upl is None:
97
  speech, meta = load_audio("samples/p232_013_clean.wav", sr)
98
  elif speech_upl is not None:
 
101
  tmp = load_audio_gradio(speech_rec, sr)
102
  assert tmp is not None
103
  speech, meta = tmp
104
+ if speech.dim() > 1 and speech.shape[0] > 1:
105
+ assert (
106
+ speech.shape[1] > speech.shape[0]
107
+ ), f"Expecting channels first, but got {speech.shape}"
108
+ speech = speech.mean(dim=0, keepdim=True)
109
+ speech = speech[..., : max_s * sr]
110
  logger.info(f"Loaded speech with shape {speech.shape}")
111
  noise, _ = load_audio(noise_fn, sr) # type: ignore
112
  if meta.sample_rate != sr:
usage.md CHANGED
@@ -4,6 +4,7 @@ This demo takes a speech sample and a noise sample and mixes them at the provide
4
  You can either record a speech sample or alternatively provide one via upload.
5
  Furthermore, you may upload a noise sample which will be mixed with the speech sample.
6
  If no samples are provided, a default will be used.
 
7
 
8
  DeepFilterNet [(link)](https://github.com/Rikorose/DeepFilterNet) is used to denoise the noisy mixture.
9
 
 
4
  You can either record a speech sample or alternatively provide one via upload.
5
  Furthermore, you may upload a noise sample which will be mixed with the speech sample.
6
  If no samples are provided, a default will be used.
7
+ Long audio samples will be trimmed to 10s.
8
 
9
  DeepFilterNet [(link)](https://github.com/Rikorose/DeepFilterNet) is used to denoise the noisy mixture.
10