Spaces:
Runtime error
Runtime error
import gzip | |
import argparse | |
from string import punctuation | |
def len_no_punc(s, punc): | |
return len([ch for ch in s if ch in punc]) | |
def filter_overpunc(len_npunc, len_sen): | |
return len_npunc < 0.5*len_sen | |
def main(args): | |
punc = punctuation + "—|–" | |
print('Processing file {}'.format(args.input)) | |
with gzip.open(args.input, 'rt', encoding=args.encoding) as tsv: | |
with open(args.bitext + '.' + args.src_lang, 'wt', encoding=args.encoding) as fsrc: | |
with open(args.bitext + '.' + args.tgt_lang, 'wt', encoding=args.encoding) as ftgt: | |
line = tsv.readline() | |
fields = line.split('\t') | |
src, tgt = fields[1], fields[2] | |
nchar_npunc_src = len_no_punc(src, punc) | |
nchar_npunc_tgt = len_no_punc(tgt, punc) | |
if filter_overpunc(nchar_npunc_src, len(src)) and filter_overpunc(nchar_npunc_tgt, len(tgt)): | |
fsrc.write(src.strip() + '\n') | |
ftgt.write(tgt.strip() + '\n') | |
if __name__ == '__main__': | |
parser = argparse.ArgumentParser() | |
parser.add_argument("--input", required=True, type=str) | |
parser.add_argument('--encoding', default='utf-8', help='character encoding for input/output') | |
parser.add_argument('--bitext', type=str, required=True, help='language direction') | |
parser.add_argument('--src-lang', type=str, required=True, help='Source language') | |
parser.add_argument('--tgt-lang', type=str, required=True, help='Target language') | |
main(parser.parse_args()) | |