Spaces:
Runtime error
Runtime error
Hendrik Schroeter
committed on
Commit
•
4f235e3
1
Parent(s):
897b496
Force mono audio with max len of 10s
Browse files
app.py
CHANGED
@@ -41,7 +41,7 @@ def mix_at_snr(clean, noise, snr, eps=1e-10):
|
|
41 |
if noise.shape[1] < clean.shape[1]:
|
42 |
noise = noise.repeat((1, int(math.ceil(clean.shape[1] / noise.shape[1]))))
|
43 |
max_start = int(noise.shape[1] - clean.shape[1])
|
44 |
-
start = torch.randint(0, max_start, ()).item()
|
45 |
logger.debug(f"start: {start}, {clean.shape}")
|
46 |
noise = noise[:, start : start + clean.shape[1]]
|
47 |
E_speech = torch.mean(clean.pow(2)) + eps
|
@@ -92,6 +92,7 @@ def mix_and_denoise(
|
|
92 |
if noise_fn is None:
|
93 |
noise_fn = "samples/dkitchen.wav"
|
94 |
meta = AudioMetaData(-1, -1, -1, -1, "")
|
|
|
95 |
if speech_rec is None and speech_upl is None:
|
96 |
speech, meta = load_audio("samples/p232_013_clean.wav", sr)
|
97 |
elif speech_upl is not None:
|
@@ -100,6 +101,12 @@ def mix_and_denoise(
|
|
100 |
tmp = load_audio_gradio(speech_rec, sr)
|
101 |
assert tmp is not None
|
102 |
speech, meta = tmp
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
logger.info(f"Loaded speech with shape {speech.shape}")
|
104 |
noise, _ = load_audio(noise_fn, sr) # type: ignore
|
105 |
if meta.sample_rate != sr:
|
|
|
41 |
if noise.shape[1] < clean.shape[1]:
|
42 |
noise = noise.repeat((1, int(math.ceil(clean.shape[1] / noise.shape[1]))))
|
43 |
max_start = int(noise.shape[1] - clean.shape[1])
|
44 |
+
start = torch.randint(0, max_start, ()).item() if max_start > 0 else 0
|
45 |
logger.debug(f"start: {start}, {clean.shape}")
|
46 |
noise = noise[:, start : start + clean.shape[1]]
|
47 |
E_speech = torch.mean(clean.pow(2)) + eps
|
|
|
92 |
if noise_fn is None:
|
93 |
noise_fn = "samples/dkitchen.wav"
|
94 |
meta = AudioMetaData(-1, -1, -1, -1, "")
|
95 |
+
max_s = 10 # limit to 10 seconds
|
96 |
if speech_rec is None and speech_upl is None:
|
97 |
speech, meta = load_audio("samples/p232_013_clean.wav", sr)
|
98 |
elif speech_upl is not None:
|
|
|
101 |
tmp = load_audio_gradio(speech_rec, sr)
|
102 |
assert tmp is not None
|
103 |
speech, meta = tmp
|
104 |
+
if speech.dim() > 1 and speech.shape[0] > 1:
|
105 |
+
assert (
|
106 |
+
speech.shape[1] > speech.shape[0]
|
107 |
+
), f"Expecting channels first, but got {speech.shape}"
|
108 |
+
speech = speech.mean(dim=0, keepdim=True)
|
109 |
+
speech = speech[..., : max_s * sr]
|
110 |
logger.info(f"Loaded speech with shape {speech.shape}")
|
111 |
noise, _ = load_audio(noise_fn, sr) # type: ignore
|
112 |
if meta.sample_rate != sr:
|
usage.md
CHANGED
@@ -4,6 +4,7 @@ This demo takes a speech sample and a noise sample and mixes them at the provide
|
|
4 |
You can either record a speech sample or alternatively provide one via upload.
|
5 |
Furthermore, you may upload a noise sample which will be mixed with the speech sample.
|
6 |
If no samples are provided, a default will be used.
|
|
|
7 |
|
8 |
DeepFilterNet [(link)](https://github.com/Rikorose/DeepFilterNet) is used to denoise the noisy mixture.
|
9 |
|
|
|
4 |
You can either record a speech sample or alternatively provide one via upload.
|
5 |
Furthermore, you may upload a noise sample which will be mixed with the speech sample.
|
6 |
If no samples are provided, a default will be used.
|
7 |
+
Long audio samples will be trimmed to 10s.
|
8 |
|
9 |
DeepFilterNet [(link)](https://github.com/Rikorose/DeepFilterNet) is used to denoise the noisy mixture.
|
10 |
|