File size: 1,686 Bytes
b2eb230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import random
from multiprocessing import Pool
from pathlib import Path

import click
import librosa
import torch.nn.functional as F
import torchaudio
from tqdm import tqdm

from tools.file import AUDIO_EXTENSIONS, list_files

threshold = 10 ** (-50 / 20.0)


def process(file):
    waveform, sample_rate = torchaudio.load(str(file), backend="sox")
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    loudness = librosa.feature.rms(
        y=waveform.numpy().squeeze(), frame_length=2048, hop_length=512, center=True
    )[0]

    for i in range(len(loudness) - 1, 0, -1):
        if loudness[i] > threshold:
            break

    end_silent_time = (len(loudness) - i) * 512 / sample_rate

    if end_silent_time <= 0.3:
        random_time = random.uniform(0.3, 0.7) - end_silent_time
        waveform = F.pad(
            waveform, (0, int(random_time * sample_rate)), mode="constant", value=0
        )

    for i in range(len(loudness)):
        if loudness[i] > threshold:
            break

    start_silent_time = i * 512 / sample_rate

    if start_silent_time > 0.02:
        waveform = waveform[:, int((start_silent_time - 0.02) * sample_rate) :]

    torchaudio.save(uri=str(file), src=waveform, sample_rate=sample_rate)


@click.command()
@click.argument("source", type=Path)
@click.option("--num-workers", type=int, default=12)
def main(source, num_workers):
    files = list(list_files(source, AUDIO_EXTENSIONS, recursive=True))

    with Pool(num_workers) as p:
        list(tqdm(p.imap_unordered(process, files), total=len(files)))


if __name__ == "__main__":
    main()