File size: 3,695 Bytes
ee21b96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.


import os
import glob
import argparse
from utils.dedup import deup
import sys

# Root of the working directory, taken from the environment. Nothing in the
# visible part of this script reads it afterwards, but the script refuses to
# run without it.
WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT')

if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
    # Report on stderr so the message does not get mixed into the
    # tab-separated report this script writes to stdout.
    print(
        'please specify your working directory root in OS environment '
        'variable WORKDIR_ROOT. Exiting...',
        file=sys.stderr,
    )
    sys.exit(-1)

def get_directions(folder):
    """Collect the direction component of every ``train*`` file in *folder*.

    The second dot-separated component of each matching file name is taken
    as the direction, e.g. ``train.en-fr.en`` yields ``en-fr``.

    Args:
        folder: directory to scan. It is treated as a literal path, so any
            glob metacharacters it contains are escaped.

    Returns:
        A list with one entry per matching file, in the order ``glob``
        yields them. The same direction may therefore appear several times
        (once per file); callers de-duplicate as needed. Files whose name
        has no second dot-separated component (e.g. a plain ``train``) are
        skipped instead of raising ``IndexError``.
    """
    directions = []
    for file_path in glob.glob(f'{glob.escape(folder)}/train*'):
        name_parts = os.path.basename(file_path).split('.')
        if len(name_parts) > 1:
            directions.append(name_parts[1])
    return directions

def diff_list(lhs, rhs):
    """Return the set of items that occur in *lhs* but not in *rhs*."""
    excluded = set(rhs)
    return {item for item in lhs if item not in excluded}

def _iter_sentence_pairs(src_file, tgt_file):
    """Yield aligned (source, target) lines with the line terminator removed."""
    with open(src_file, encoding='utf-8') as fsrc, \
            open(tgt_file, encoding='utf-8') as ftgt:
        # NOTE: zip stops at the shorter file, so surplus lines in the longer
        # one are ignored.
        for s, t in zip(fsrc, ftgt):
            # Strip the terminator so that a final line without a trailing
            # newline still compares equal to the same sentence elsewhere.
            yield s.rstrip('\n'), t.rstrip('\n')


def check_diff(
    from_src_file, from_tgt_file,
    to_src_file, to_tgt_file,
):
    """Count how much of the "from" parallel data also occurs in the "to" data.

    Both file pairs are read line by line as UTF-8, pairing line *i* of the
    source file with line *i* of the target file.

    Args:
        from_src_file: source side of the data being looked up.
        from_tgt_file: target side of the data being looked up.
        to_src_file: source side of the data being searched.
        to_tgt_file: target side of the data being searched.

    Returns:
        A tuple ``(common, common_src, common_tgt, from_count, to_count)``:

        * ``common`` - distinct (source, target) pairs present in both.
        * ``common_src`` - distinct source lines present in both.
        * ``common_tgt`` - distinct target lines present in both.
        * ``from_count`` - number of line pairs read from the "from" files.
        * ``to_count`` - number of line pairs read from the "to" files.
    """
    remaining_pairs = set()
    remaining_src = set()
    remaining_tgt = set()
    from_count = 0
    for s, t in _iter_sentence_pairs(from_src_file, from_tgt_file):
        remaining_pairs.add((s, t))
        remaining_src.add(s)
        remaining_tgt.add(t)
        from_count += 1

    common = 0
    common_src = 0
    common_tgt = 0
    to_count = 0
    # Every match is removed from the "from" sets once counted, so each
    # distinct item is counted at most once without having to remember every
    # pair of the (possibly much larger) "to" data.
    for s, t in _iter_sentence_pairs(to_src_file, to_tgt_file):
        to_count += 1
        if (s, t) in remaining_pairs:
            common += 1
            remaining_pairs.remove((s, t))
        if s in remaining_src:
            common_src += 1
            remaining_src.remove(s)
        if t in remaining_tgt:
            common_tgt += 1
            remaining_tgt.remove(t)
    return common, common_src, common_tgt, from_count, to_count

def main(argv=None):
    """Report, per direction, how much of a held-out split occurs in training.

    For every direction ``src-tgt`` the ``<split>.src-tgt.{src,tgt}`` files
    in ``--folder`` are compared against ``train.src-tgt.{src,tgt}`` with
    ``check_diff`` and one tab-separated row is printed to stdout.

    Args:
        argv: optional list of command-line arguments; when ``None`` argparse
            falls back to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--folder", type=str, required=True,
                        help="the data folder ")
    parser.add_argument("--split", type=str, default='test',
                        help="split (valid, test) to check against training data")
    parser.add_argument('--directions', type=str, default=None, required=False,
                        help="comma-separated directions such as en-fr,de-en; "
                             "by default they are derived from the train* "
                             "files in --folder")

    args = parser.parse_args(argv)

    if args.directions is None:
        directions = get_directions(args.folder)
    else:
        directions = args.directions.split(',')
    # De-duplicate and order once, whichever way the list was obtained.
    directions = sorted(set(directions))

    print(f'checking where {args.split} split data are in training')
    print('direction\tcommon_count\tsrc common\ttgt common\tfrom_size\tto_size')

    for direction in directions:
        langs = direction.split('-')
        if len(langs) != 2:
            # Not of the form "src-tgt": report it and carry on with the
            # remaining directions rather than crashing on the unpacking.
            print(f'skipping malformed direction: {direction!r}',
                  file=sys.stderr)
            continue
        src, tgt = langs
        from_src_file = f'{args.folder}/{args.split}.{src}-{tgt}.{src}'
        from_tgt_file = f'{args.folder}/{args.split}.{src}-{tgt}.{tgt}'
        if not os.path.exists(from_src_file):
            # some test/valid data might be in the reverse direction:
            from_src_file = f'{args.folder}/{args.split}.{tgt}-{src}.{src}'
            from_tgt_file = f'{args.folder}/{args.split}.{tgt}-{src}.{tgt}'
        to_src_file = f'{args.folder}/train.{src}-{tgt}.{src}'
        to_tgt_file = f'{args.folder}/train.{src}-{tgt}.{tgt}'
        # Directions lacking either the training or the split source file
        # are skipped silently.
        if not os.path.exists(to_src_file) or not os.path.exists(from_src_file):
            continue
        r = check_diff(from_src_file, from_tgt_file, to_src_file, to_tgt_file)
        # Join every column with a tab so the row lines up with the header
        # (no stray space after the direction column).
        print('\t'.join([direction, *map(str, r)]))
                

# Run the check only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()