import os
import sys
import glob
import argparse
import shutil
import itertools
from subprocess import check_call, check_output


def call_output(cmd):
    print(f"Executing: {cmd}")
    ret = check_output(cmd, shell=True)
    print(ret)
    return ret


def call(cmd):
    print(cmd)
    check_call(cmd, shell=True)


# Required environment variables: WORKDIR_ROOT (working directory root) and
# SPM_PATH (path to the installed spm_encode.py).
WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)

if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
    print('Please specify your working directory root in the OS environment variable WORKDIR_ROOT. Exiting...')
    sys.exit(-1)

SPM_PATH = os.environ.get('SPM_PATH', None)

if SPM_PATH is None or not SPM_PATH.strip():
    print("Please install sentencepiece from https://github.com/google/sentencepiece and set SPM_PATH to point to the installed spm_encode.py. Exiting...")
    sys.exit(-1)

SPM_MODEL = f'{WORKDIR_ROOT}/sentence.bpe.model'
SPM_VOCAB = f'{WORKDIR_ROOT}/dict_250k.txt'
SPM_ENCODE = f'{SPM_PATH}'

# Download the mBART50 SentencePiece model and 250k dictionary if missing.
if not os.path.exists(SPM_MODEL):
    call(f"wget https://dl.fbaipublicfiles.com/fairseq/models/mbart50/sentence.bpe.model -O {SPM_MODEL}")

if not os.path.exists(SPM_VOCAB):
    call(f"wget https://dl.fbaipublicfiles.com/fairseq/models/mbart50/dict_250k.txt -O {SPM_VOCAB}")


def get_data_size(raw):
    cmd = f'wc -l {raw}'
    ret = call_output(cmd)
    return int(ret.split()[0])


def encode_spm(model, direction, prefix='', splits=['train', 'test', 'valid'], pairs_per_shard=None):
    # Apply SentencePiece encoding to the raw parallel files of one direction.
    src, tgt = direction.split('-')

    for split in splits:
        src_raw, tgt_raw = f'{RAW_DIR}/{split}{prefix}.{direction}.{src}', f'{RAW_DIR}/{split}{prefix}.{direction}.{tgt}'
        if os.path.exists(src_raw) and os.path.exists(tgt_raw):
            cmd = f"""python {SPM_ENCODE} \
                --model {model} \
                --output_format=piece \
                --inputs {src_raw} {tgt_raw} \
                --outputs {BPE_DIR}/{direction}{prefix}/{split}.bpe.{src} {BPE_DIR}/{direction}{prefix}/{split}.bpe.{tgt}"""
            print(cmd)
            call(cmd)


def binarize_(
    bpe_dir,
    databin_dir,
    direction,
    spm_vocab=SPM_VOCAB,
    splits=['train', 'test', 'valid'],
):
    # Run fairseq-preprocess on the SPM-encoded files of one direction.
    src, tgt = direction.split('-')

    try:
        shutil.rmtree(f'{databin_dir}', ignore_errors=True)
        os.mkdir(f'{databin_dir}')
    except OSError as error:
        print(error)

    cmds = [
        "fairseq-preprocess",
        f"--source-lang {src} --target-lang {tgt}",
        f"--destdir {databin_dir}/",
        f"--workers 8",
    ]
    if isinstance(spm_vocab, tuple):
        src_vocab, tgt_vocab = spm_vocab
        cmds.extend(
            [
                f"--srcdict {src_vocab}",
                f"--tgtdict {tgt_vocab}",
            ]
        )
    else:
        cmds.extend(
            [
                f"--joined-dictionary",
                f"--srcdict {spm_vocab}",
            ]
        )

    input_options = []
    if 'train' in splits and glob.glob(f"{bpe_dir}/train.bpe*"):
        input_options.append(f"--trainpref {bpe_dir}/train.bpe")
    if 'valid' in splits and glob.glob(f"{bpe_dir}/valid.bpe*"):
        input_options.append(f"--validpref {bpe_dir}/valid.bpe")
    if 'test' in splits and glob.glob(f"{bpe_dir}/test.bpe*"):
        input_options.append(f"--testpref {bpe_dir}/test.bpe")

    if len(input_options) > 0:
        cmd = " ".join(cmds + input_options)
        print(cmd)
        call(cmd)


def binarize(
    databin_dir,
    direction,
    spm_vocab=SPM_VOCAB,
    prefix='',
    splits=['train', 'test', 'valid'],
    pairs_per_shard=None,
):
    # Binarize one direction, optionally keeping the sharded layout produced
    # upstream for large training sets.
    def move_databin_files(from_folder, to_folder):
        for bin_file in glob.glob(f"{from_folder}/*.bin") \
                + glob.glob(f"{from_folder}/*.idx") \
                + glob.glob(f"{from_folder}/dict*"):
            try:
                shutil.move(bin_file, to_folder)
            except OSError as error:
                print(error)

    bpe_databin_dir = f"{BPE_DIR}/{direction}{prefix}_databin"
    bpe_dir = f"{BPE_DIR}/{direction}{prefix}"
    if pairs_per_shard is None:
        binarize_(bpe_dir, bpe_databin_dir, direction, spm_vocab=spm_vocab, splits=splits)
        move_databin_files(bpe_databin_dir, databin_dir)
    else:
        # binarize valid and test which will not be sharded
        binarize_(
            bpe_dir, bpe_databin_dir, direction,
            spm_vocab=spm_vocab, splits=[s for s in splits if s != "train"])
        for shard_bpe_dir in glob.glob(f"{bpe_dir}/shard*"):
            path_strs = os.path.split(shard_bpe_dir)
            shard_str = path_strs[-1]
            shard_folder = f"{bpe_databin_dir}/{shard_str}"
            databin_shard_folder = f"{databin_dir}/{shard_str}"
            print(f'working from {shard_folder} to {databin_shard_folder}')
            os.makedirs(databin_shard_folder, exist_ok=True)
            binarize_(
                shard_bpe_dir, shard_folder, direction,
                spm_vocab=spm_vocab, splits=["train"])

            # Symlink the unsharded valid/test data into every shard folder.
            for test_data in glob.glob(f"{bpe_databin_dir}/valid.*") + glob.glob(f"{bpe_databin_dir}/test.*"):
                filename = os.path.split(test_data)[-1]
                try:
                    os.symlink(test_data, f"{databin_shard_folder}/{filename}")
                except OSError as error:
                    print(error)

            move_databin_files(shard_folder, databin_shard_folder)


def load_langs(path):
    with open(path) as fr:
        langs = [l.strip() for l in fr]
    return langs


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_root", default=f"{WORKDIR_ROOT}/ML50")
    parser.add_argument("--raw-folder", default='raw')
    parser.add_argument("--bpe-folder", default='bpe')
    parser.add_argument("--databin-folder", default='databin')

    args = parser.parse_args()

    DATA_PATH = args.data_root
    RAW_DIR = f'{DATA_PATH}/{args.raw_folder}'
    BPE_DIR = f'{DATA_PATH}/{args.bpe_folder}'
    DATABIN_DIR = f'{DATA_PATH}/{args.databin_folder}'
    os.makedirs(BPE_DIR, exist_ok=True)

    # Discover the language directions from the raw file names, e.g.
    # train.<src-tgt>.<lang> yields the direction "<src-tgt>".
    raw_files = itertools.chain(
        glob.glob(f'{RAW_DIR}/train*'),
        glob.glob(f'{RAW_DIR}/valid*'),
        glob.glob(f'{RAW_DIR}/test*'),
    )
    directions = [os.path.split(file_path)[-1].split('.')[1] for file_path in raw_files]

    for direction in directions:
        prefix = ""
        splits = ['train', 'valid', 'test']
        try:
            shutil.rmtree(f'{BPE_DIR}/{direction}{prefix}', ignore_errors=True)
            os.mkdir(f'{BPE_DIR}/{direction}{prefix}')
            os.makedirs(DATABIN_DIR, exist_ok=True)
        except OSError as error:
            print(error)

        spm_model, spm_vocab = SPM_MODEL, SPM_VOCAB
        encode_spm(spm_model, direction=direction, splits=splits)
        binarize(DATABIN_DIR, direction, spm_vocab=spm_vocab, splits=splits)
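
# Usage sketch: the script filename and paths below are placeholders, not part
# of this file; substitute whatever this script is saved as in your checkout.
# It assumes the raw parallel files already exist under
# {data_root}/{raw-folder} with names of the form {split}.{src-tgt}.{lang},
# which is how the main block above discovers the language directions.
#
#   WORKDIR_ROOT=/path/to/workdir \
#   SPM_PATH=/path/to/sentencepiece/spm_encode.py \
#   python preprocess_ML50.py \
#       --data_root $WORKDIR_ROOT/ML50 \
#       --raw-folder raw --bpe-folder bpe --databin-folder databin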