JustinLin610's picture
first commit
ee21b96
raw
history blame
2.18 kB
import os
import shlex
import subprocess
import progressbar
from time import time
from pathlib import Path
def find_all_files(path_dir, extension):
    """Recursively collect the files under ``path_dir`` that have ``extension``.

    Args:
        path_dir: Root directory to walk (sub-directories are included).
        extension: File extension to look for, with or without the leading
            dot (``"wav"`` and ``".wav"`` are equivalent). An empty string
            matches every file.

    Returns:
        A list of ``(stem, full_path)`` tuples, where ``stem`` is the file
        name without its last suffix and ``full_path`` is the directory
        reported by ``os.walk`` joined with the file name. Order follows
        ``os.walk`` and is therefore not guaranteed to be sorted.
    """
    # Anchor the match on the dot so that "wav" does not also match a file
    # that merely ends with those letters (e.g. "notawav").
    if extension and not extension.startswith("."):
        suffix = "." + extension
    else:
        suffix = extension

    return [
        (Path(name).stem, os.path.join(root, name))
        for root, _dirs, filenames in os.walk(path_dir)
        for name in filenames
        if name.endswith(suffix)
    ]
def convert16k(inputfile, outputfile16k):
    """Convert ``inputfile`` to a wav file at ``outputfile16k`` by running sox.

    The external ``sox`` program is invoked with ``-c 1 -b 16`` ahead of the
    input path, ``-t wav`` ahead of the output path, and the ``rate 16k``
    effect, in the same order as the original command string.

    Args:
        inputfile: Path of the audio file to read.
        outputfile16k: Path of the wav file to write.

    Returns:
        The exit status reported by ``subprocess.call`` for the sox process
        (0 means sox reported success).

    Raises:
        OSError: If the ``sox`` executable cannot be launched.
    """
    # Build the argument vector directly instead of formatting a string and
    # re-splitting it: paths containing spaces or quotes would otherwise be
    # broken into several arguments.
    command = [
        'sox',
        '-c', '1',
        '-b', '16',
        str(inputfile),
        '-t', 'wav',
        str(outputfile16k),
        'rate', '16k',
    ]
    return subprocess.call(command)
# Command-line entry point: mirror the directory tree of `input_dir` under
# `output_dir`, converting every matching audio file with convert16k().
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Convert to wav 16k audio using sox.')
    parser.add_argument('input_dir', type=str,
                        help='Path to the input dir.')
    parser.add_argument('output_dir', type=str,
                        help='Path to the output dir.')
    # The help text previously advertised "mp3" although the default is 'wav'.
    parser.add_argument('--extension', type=str, default='wav',
                        help='Audio file extension in the input. Default: wav')
    args = parser.parse_args()

    # Find all audio files below the input directory.
    print(f"Finding all audio files with extension '{args.extension}' from {args.input_dir}...")
    found = find_all_files(args.input_dir, args.extension)
    print(f"Done! Found {len(found)} files.")

    # Keep only each file's path, expressed relative to the input directory,
    # so the same sub-path can be re-created under the output directory.
    audio_files = [os.path.relpath(path, start=args.input_dir) for _stem, path in found]

    # Create every output sub-directory up front (one mkdir per distinct dir).
    for rel_dir in {os.path.dirname(rel_path) for rel_path in audio_files}:
        Path(args.output_dir, rel_dir).mkdir(parents=True, exist_ok=True)

    # Convert the files one by one, reporting progress.
    print("Converting the audio to wav files...")
    start_time = time()
    # Skip the progress bar entirely when nothing was found: a zero-length
    # bar has nothing to display.
    # NOTE(review): some progressbar releases may not accept maxval=0 -- unverified.
    if audio_files:
        bar = progressbar.ProgressBar(maxval=len(audio_files))
        bar.start()
        for index, rel_path in enumerate(audio_files):
            bar.update(index)
            input_file = os.path.join(args.input_dir, rel_path)
            # The output keeps the relative path but always gets a .wav suffix.
            output_file = os.path.join(args.output_dir, os.path.splitext(rel_path)[0] + ".wav")
            convert16k(input_file, output_file)
        bar.finish()
    print(f"...done {len(audio_files)} files in {time()-start_time} seconds.")