File size: 1,796 Bytes
ee21b96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.



import os
import glob
import argparse
from utils.dedup import deup

import sys
# Root of the working directory, supplied through the environment. The script
# refuses to start when it is missing or blank.
# NOTE(review): nothing in the visible part of this file reads WORKDIR_ROOT
# after this check -- confirm whether the requirement is still needed.
WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)

if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
    # Diagnostics go to stderr so stdout stays clean; the exit status is
    # unchanged (-1).
    print('please specify your working directory root in OS environment variable WORKDIR_ROOT. Exiting...',
          file=sys.stderr)
    sys.exit(-1)


def _discover_directions(from_folder):
    """Return the set of ``<src>-<tgt>`` directions found in *from_folder*.

    Every file matching ``train*`` is inspected; a file contributes a
    direction when the second dot-separated field of its name contains exactly
    one hyphen (e.g. ``train.en-fr.en`` -> ``en-fr``). Other ``train*`` files
    such as ``train`` or ``train.log`` are ignored rather than aborting the
    run.
    """
    found = set()
    # Escape the folder so glob metacharacters in its name are taken literally.
    for file_path in glob.glob(f'{glob.escape(from_folder)}/train*'):
        fields = os.path.basename(file_path).split('.')
        if len(fields) > 1 and fields[1].count('-') == 1:
            found.add(fields[1])
    return found


def main():
    """Run ``deup`` on each ``train.<src>-<tgt>.{<src>,<tgt>}`` file pair.

    Command-line arguments:
        --from-folder: folder holding the input ``train.*`` files (required).
        --to-folder:   folder receiving the output files (required); created
                       if it does not exist.
        --directions:  optional comma-separated list such as ``en-fr,en-de``.
                       When omitted, directions are discovered from the
                       ``train*`` file names in ``--from-folder``.

    Exits through ``parser.error`` (status 2) when both folders resolve to the
    same directory.

    Raises:
        ValueError: if an explicitly requested direction is not of the form
            ``<src>-<tgt>``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--from-folder", type=str, required=True,
                        help="the data folder to be dedup")
    parser.add_argument("--to-folder", type=str, required=True,
                        help="the data folder to save deduped data")
    parser.add_argument('--directions', type=str, default=None, required=False)

    args = parser.parse_args()

    # Guard against reading and writing the same files. Resolved paths are
    # compared so that spellings like "data" and "data/" are recognised as the
    # same folder, and an explicit check (not `assert`) survives `python -O`.
    if os.path.realpath(args.from_folder) == os.path.realpath(args.to_folder):
        parser.error('--from-folder and --to-folder must be different directories')

    if args.directions is None:
        directions = _discover_directions(args.from_folder)
    else:
        # Tolerate spaces and empty items, e.g. "en-fr, en-de,".
        directions = {item.strip() for item in args.directions.split(',') if item.strip()}

    # NOTE(review): `deup` is defined elsewhere; this assumes it does not
    # create the output folder itself. The call is a no-op when it exists.
    os.makedirs(args.to_folder, exist_ok=True)

    # Sorted so the processing order is deterministic.
    for direction in sorted(directions):
        langs = direction.split('-')
        if len(langs) != 2:
            raise ValueError(
                f'invalid direction {direction!r}: expected the form <src>-<tgt>')
        src, tgt = langs
        src_file = f'{args.from_folder}/train.{src}-{tgt}.{src}'
        tgt_file = f'{args.from_folder}/train.{src}-{tgt}.{tgt}'
        src_file_out = f'{args.to_folder}/train.{src}-{tgt}.{src}'
        tgt_file_out = f'{args.to_folder}/train.{src}-{tgt}.{tgt}'
        print(f'deduping {src_file}, {tgt_file}')
        deup(src_file, tgt_file, src_file_out, tgt_file_out)
                

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()