Spaces:
Runtime error
Runtime error
#!/usr/bin/env python3 -u | |
# Copyright (c) Facebook, Inc. and its affiliates. | |
# | |
# This source code is licensed under the MIT license found in the | |
# LICENSE file in the root directory of this source tree. | |
import argparse | |
import sys | |
from copy import deepcopy | |
from scipy.signal import lfilter | |
import numpy as np | |
from tqdm import tqdm | |
import soundfile as sf | |
import os.path as osp | |
def get_parser(): | |
parser = argparse.ArgumentParser(description="compute vad segments") | |
parser.add_argument( | |
"--rvad-home", | |
"-r", | |
help="path to rvad home (see https://github.com/zhenghuatan/rVADfast)", | |
required=True, | |
) | |
return parser | |
def rvad(speechproc, path): | |
winlen, ovrlen, pre_coef, nfilter, nftt = 0.025, 0.01, 0.97, 20, 512 | |
ftThres = 0.5 | |
vadThres = 0.4 | |
opts = 1 | |
data, fs = sf.read(path) | |
assert fs == 16_000, "sample rate must be 16khz" | |
ft, flen, fsh10, nfr10 = speechproc.sflux(data, fs, winlen, ovrlen, nftt) | |
# --spectral flatness -- | |
pv01 = np.zeros(ft.shape[0]) | |
pv01[np.less_equal(ft, ftThres)] = 1 | |
pitch = deepcopy(ft) | |
pvblk = speechproc.pitchblockdetect(pv01, pitch, nfr10, opts) | |
# --filtering-- | |
ENERGYFLOOR = np.exp(-50) | |
b = np.array([0.9770, -0.9770]) | |
a = np.array([1.0000, -0.9540]) | |
fdata = lfilter(b, a, data, axis=0) | |
# --pass 1-- | |
noise_samp, noise_seg, n_noise_samp = speechproc.snre_highenergy( | |
fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk | |
) | |
# sets noisy segments to zero | |
for j in range(n_noise_samp): | |
fdata[range(int(noise_samp[j, 0]), int(noise_samp[j, 1]) + 1)] = 0 | |
vad_seg = speechproc.snre_vad( | |
fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk, vadThres | |
) | |
return vad_seg, data | |
def main(): | |
parser = get_parser() | |
args = parser.parse_args() | |
sys.path.append(args.rvad_home) | |
import speechproc | |
stride = 160 | |
lines = sys.stdin.readlines() | |
root = lines[0].rstrip() | |
for fpath in tqdm(lines[1:]): | |
path = osp.join(root, fpath.split()[0]) | |
vads, wav = rvad(speechproc, path) | |
start = None | |
vad_segs = [] | |
for i, v in enumerate(vads): | |
if start is None and v == 1: | |
start = i * stride | |
elif start is not None and v == 0: | |
vad_segs.append((start, i * stride)) | |
start = None | |
if start is not None: | |
vad_segs.append((start, len(wav))) | |
print(" ".join(f"{v[0]}:{v[1]}" for v in vad_segs)) | |
if __name__ == "__main__": | |
main() | |