JustinLin610's picture
first commit
ee21b96
raw
history blame
2.57 kB
#!/usr/bin/env python3 -u
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import sys
from copy import deepcopy
from scipy.signal import lfilter
import numpy as np
from tqdm import tqdm
import soundfile as sf
import os.path as osp
def get_parser():
    """Build the command-line parser for the VAD segment script.

    Returns:
        argparse.ArgumentParser: a parser with a single required option,
        ``--rvad-home`` / ``-r`` (stored as ``rvad_home``), pointing at a
        local rVADfast checkout.
    """
    rvad_home_options = {
        "help": "path to rvad home (see https://github.com/zhenghuatan/rVADfast)",
        "required": True,
    }
    cli = argparse.ArgumentParser(description="compute vad segments")
    cli.add_argument("--rvad-home", "-r", **rvad_home_options)
    return cli
def rvad(speechproc, path):
    """Run the rVAD voice-activity pipeline on one audio file.

    Args:
        speechproc: the ``speechproc`` module from an rVADfast checkout. It
            must provide ``sflux``, ``pitchblockdetect``, ``snre_highenergy``
            and ``snre_vad``.
        path: path of the audio file; it is read with ``soundfile.read``.

    Returns:
        tuple: ``(vad_seg, data)`` where ``vad_seg`` is whatever
        ``speechproc.snre_vad`` returns (presumably one 0/1 flag per frame --
        confirm against rVADfast) and ``data`` is the unmodified samples read
        from ``path``.

    Raises:
        ValueError: if the file's sample rate is not 16000 Hz.
    """
    winlen, ovrlen, nftt = 0.025, 0.01, 512
    ftThres = 0.5
    vadThres = 0.4
    opts = 1

    data, fs = sf.read(path)
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    if fs != 16_000:
        raise ValueError(f"sample rate must be 16khz (got {fs} for {path})")

    ft, flen, fsh10, nfr10 = speechproc.sflux(data, fs, winlen, ovrlen, nftt)

    # --spectral flatness --
    # Mark every frame whose flatness value is at or below the threshold.
    pv01 = np.zeros(ft.shape[0])
    pv01[np.less_equal(ft, ftThres)] = 1
    # Hand pitchblockdetect its own copy so ``ft`` itself cannot be altered.
    pitch = deepcopy(ft)

    pvblk = speechproc.pitchblockdetect(pv01, pitch, nfr10, opts)

    # --filtering--
    ENERGYFLOOR = np.exp(-50)
    # First-order IIR filter; the numerator coefficients sum to zero, so the
    # DC component of the signal is removed.
    b = np.array([0.9770, -0.9770])
    a = np.array([1.0000, -0.9540])
    fdata = lfilter(b, a, data, axis=0)

    # --pass 1--
    noise_samp, noise_seg, n_noise_samp = speechproc.snre_highenergy(
        fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk
    )

    # sets noisy segments to zero (both range endpoints are inclusive)
    for j in range(n_noise_samp):
        first, last = int(noise_samp[j, 0]), int(noise_samp[j, 1])
        fdata[range(first, last + 1)] = 0

    # --pass 2: VAD decision on the filtered, noise-suppressed signal--
    vad_seg = speechproc.snre_vad(
        fdata, nfr10, flen, fsh10, ENERGYFLOOR, pv01, pvblk, vadThres
    )
    return vad_seg, data
def _vad_to_segments(vads, stride, num_samples):
    """Collapse per-frame 0/1 flags into ``(start, end)`` sample ranges.

    A segment opens at the first flag equal to 1 and closes at the next flag
    equal to 0; a segment still open after the last flag ends at
    ``num_samples``. Frame index ``i`` maps to sample offset ``i * stride``.
    """
    segments = []
    start = None
    for i, v in enumerate(vads):
        if start is None and v == 1:
            start = i * stride
        elif start is not None and v == 0:
            segments.append((start, i * stride))
            start = None
    if start is not None:
        segments.append((start, num_samples))
    return segments


def main():
    """Read an audio manifest from stdin and print VAD segments per file.

    Input (stdin): the first line is the root directory; on every following
    line the first whitespace-separated field is an audio path relative to
    that root.

    Output (stdout): one line per audio file with space-separated
    ``start:end`` sample ranges of detected speech (an empty line when no
    speech was found).

    Exits through ``parser.error`` (status 2) when stdin is empty, rather
    than failing with a bare ``IndexError``.
    """
    parser = get_parser()
    args = parser.parse_args()

    # speechproc lives in the user-supplied rVADfast checkout, so it can only
    # be imported after that directory has been put on sys.path.
    sys.path.append(args.rvad_home)
    import speechproc

    # Each VAD flag is taken to cover 160 samples (10 ms at the 16 kHz rate
    # that rvad() requires).
    stride = 160

    lines = sys.stdin.readlines()
    if not lines:
        parser.error(
            "no input on stdin: expected a root directory line followed by "
            "one audio path per line"
        )

    root = lines[0].rstrip()
    for fpath in tqdm(lines[1:]):
        path = osp.join(root, fpath.split()[0])
        vads, wav = rvad(speechproc, path)
        vad_segs = _vad_to_segments(vads, stride, len(wav))
        print(" ".join(f"{v[0]}:{v[1]}" for v in vad_segs))
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()