File size: 3,079 Bytes
b2eb230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import math
from pathlib import Path
from random import Random

import click
from loguru import logger
from pydub import AudioSegment
from tqdm import tqdm

from tools.file import AUDIO_EXTENSIONS, list_files, load_filelist


@click.command()
@click.argument("root", type=click.Path(exists=True, path_type=Path))
@click.option("--val-ratio", type=float, default=None)
@click.option("--val-count", type=int, default=None)
@click.option("--filelist", default=None, type=Path)
@click.option("--min-duration", default=None, type=float)
@click.option("--max-duration", default=None, type=float)
def main(root, val_ratio, val_count, filelist, min_duration, max_duration):
    if filelist:
        files = [i[0] for i in load_filelist(filelist)]
    else:
        files = list_files(root, AUDIO_EXTENSIONS, recursive=True, sort=True)

    if min_duration is None and max_duration is None:
        filtered_files = list(map(str, [file.relative_to(root) for file in files]))
    else:
        filtered_files = []
        for file in tqdm(files):
            try:
                audio = AudioSegment.from_file(str(file))
                duration = len(audio) / 1000.0

                if min_duration is not None and duration < min_duration:
                    logger.info(
                        f"Skipping {file} due to duration {duration:.2f} < {min_duration:.2f}"
                    )
                    continue

                if max_duration is not None and duration > max_duration:
                    logger.info(
                        f"Skipping {file} due to duration {duration:.2f} > {max_duration:.2f}"
                    )
                    continue

                filtered_files.append(str(file.relative_to(root)))
            except Exception as e:
                logger.info(f"Error processing {file}: {e}")

    logger.info(
        f"Found {len(files)} files, remaining {len(filtered_files)} files after filtering"
    )

    Random(42).shuffle(filtered_files)

    if val_count is None and val_ratio is None:
        logger.info("Validation ratio and count not specified, using min(20%, 100)")
        val_size = min(100, math.ceil(len(filtered_files) * 0.2))
    elif val_count is not None and val_ratio is not None:
        logger.error("Cannot specify both val_count and val_ratio")
        return
    elif val_count is not None:
        if val_count < 1 or val_count > len(filtered_files):
            logger.error("val_count must be between 1 and number of files")
            return
        val_size = val_count
    else:
        val_size = math.ceil(len(filtered_files) * val_ratio)

    logger.info(f"Using {val_size} files for validation")

    with open(root / "vq_train_filelist.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(filtered_files[val_size:]))

    with open(root / "vq_val_filelist.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(filtered_files[:val_size]))

    logger.info("Done")


if __name__ == "__main__":
    main()