File size: 844 Bytes
ee21b96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/usr/bin/env python3
#
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import sys

import sacremoses


def main(args):
    """Tokenizes, preserving tabs"""
    mt = sacremoses.MosesTokenizer(lang=args.lang)

    def tok(s):
        return mt.tokenize(s, return_str=True)

    for line in sys.stdin:
        parts = list(map(tok, line.split("\t")))
        print(*parts, sep="\t", flush=True)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--lang", "-l", default="en")
    parser.add_argument("--penn", "-p", action="store_true")
    parser.add_argument("--fields", "-f", help="fields to tokenize")
    args = parser.parse_args()

    main(args)