# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
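
"""Prepare IWSLT'17 fr-en bitext for byte-level BPE (BBPE) experiments.

Extracts train/valid/test text from the IWSLT'17 release, pre-tokenizes it
with Moses, and optionally emits BPE-, byte-, character-, and byte-level
BPE-tokenized variants of every split.

Illustrative invocation (assuming this file is saved as get_bitext.py and
the IWSLT'17 fr-en archive is extracted under <root>/fr-en; vocabulary
sizes are examples only):

    python get_bitext.py --root data --bpe-vocab 16384 --bbpe-vocab 2048 \
        --byte-vocab --char-vocab
"""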


import argparse
import os
import os.path as op
from collections import namedtuple
from multiprocessing import cpu_count
from typing import List, Optional

import sentencepiece as sp
from fairseq.data.encoders.byte_bpe import ByteBPE
from fairseq.data.encoders.byte_utils import byte_encode
from fairseq.data.encoders.bytes import Bytes
from fairseq.data.encoders.characters import Characters
from fairseq.data.encoders.moses_tokenizer import MosesTokenizer
from fairseq.data.encoders.sentencepiece_bpe import SentencepieceBPE


SPLITS = ["train", "valid", "test"]


def _convert_xml(in_path: str, out_path: str):
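    # Dev/test sets ship as XML; keep only the <seg ...>...</seg> payload of
    # each segment line.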
    with open(in_path) as f, open(out_path, "w") as f_o:
        for s in f:
            ss = s.strip()
            if not ss.startswith("<seg"):
                continue
            ss = ss.replace("</seg>", "").split('">')
            assert len(ss) == 2
            f_o.write(ss[1].strip() + "\n")


def _convert_train(in_path: str, out_path: str):
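    # Training files mix raw text with XML-like metadata lines; drop any line
    # that starts with a tag.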
    with open(in_path) as f, open(out_path, "w") as f_o:
        for s in f:
            ss = s.strip()
            if ss.startswith("<"):
                continue
            f_o.write(ss + "\n")


def _get_bytes(in_path: str, out_path: str):
    with open(in_path) as f, open(out_path, "w") as f_o:
        for s in f:
            f_o.write(Bytes.encode(s.strip()) + "\n")


def _get_chars(in_path: str, out_path: str):
    with open(in_path) as f, open(out_path, "w") as f_o:
        for s in f:
            f_o.write(Characters.encode(s.strip()) + "\n")


def pretokenize(in_path: str, out_path: str, src: str, tgt: str):
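    # MosesTokenizer expects an argparse-style namespace, so a namedtuple
    # with the required fields stands in for parsed CLI args.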
    Args = namedtuple(
        "Args",
        [
            "moses_source_lang",
            "moses_target_lang",
            "moses_no_dash_splits",
            "moses_no_escape",
        ],
    )
    args = Args(
        moses_source_lang=src,
        moses_target_lang=tgt,
        moses_no_dash_splits=False,
        moses_no_escape=False,
    )
    pretokenizer = MosesTokenizer(args)
    with open(in_path) as f, open(out_path, "w") as f_o:
        for s in f:
            f_o.write(pretokenizer.encode(s.strip()) + "\n")


def _convert_to_bchar(in_path_prefix: str, src: str, tgt: str, out_path: str):
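    # Concatenate the source and target sides into one byte-encoded file,
    # which serves as training data for the byte-level BPE vocabulary.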
    with open(out_path, "w") as f_o:
        for lang in [src, tgt]:
            with open(f"{in_path_prefix}.{lang}") as f:
                for s in f:
                    f_o.write(byte_encode(s.strip()) + "\n")


def _get_bpe(in_path: str, model_prefix: str, vocab_size: int):
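    # Train a SentencePiece BPE model on the given file; identity
    # normalization keeps (byte-encoded) input unchanged, and full character
    # coverage avoids dropping rare symbols.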
    arguments = [
        f"--input={in_path}",
        f"--model_prefix={model_prefix}",
        f"--model_type=bpe",
        f"--vocab_size={vocab_size}",
        "--character_coverage=1.0",
        "--normalization_rule_name=identity",
        f"--num_threads={cpu_count()}",
    ]
    sp.SentencePieceTrainer.Train(" ".join(arguments))


def _apply_bbpe(model_path: str, in_path: str, out_path: str):
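    # Segment text with a trained BBPE model; ByteBPE byte-encodes each line
    # before applying the SentencePiece model.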
    Args = namedtuple("Args", ["sentencepiece_model_path"])
    args = Args(sentencepiece_model_path=model_path)
    tokenizer = ByteBPE(args)
    with open(in_path) as f, open(out_path, "w") as f_o:
        for s in f:
            f_o.write(tokenizer.encode(s.strip()) + "\n")


def _apply_bpe(model_path: str, in_path: str, out_path: str):
    Args = namedtuple("Args", ["sentencepiece_model"])
    args = Args(sentencepiece_model=model_path)
    tokenizer = SentencepieceBPE(args)
    with open(in_path) as f, open(out_path, "w") as f_o:
        for s in f:
            f_o.write(tokenizer.encode(s.strip()) + "\n")


def _concat_files(in_paths: List[str], out_path: str):
    with open(out_path, "w") as f_o:
        for p in in_paths:
            with open(p) as f:
                for r in f:
                    f_o.write(r)


def preprocess_iwslt17(
    root: str,
    src: str,
    tgt: str,
    bpe_size: Optional[int],
    need_chars: bool,
    bbpe_size: Optional[int],
    need_bytes: bool,
):
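    # End-to-end pipeline: extract bitext, Moses-pretokenize it, then produce
    # whichever tokenized variants were requested.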
    # extract bitext
    in_root = op.join(root, f"{src}-{tgt}")
    for lang in [src, tgt]:
        _convert_train(
            op.join(in_root, f"train.tags.{src}-{tgt}.{lang}"),
            op.join(root, f"train.{lang}"),
        )
        _convert_xml(
            op.join(in_root, f"IWSLT17.TED.dev2010.{src}-{tgt}.{lang}.xml"),
            op.join(root, f"valid.{lang}"),
        )
        _convert_xml(
            op.join(in_root, f"IWSLT17.TED.tst2015.{src}-{tgt}.{lang}.xml"),
            op.join(root, f"test.{lang}"),
        )
    # pre-tokenize
    for lang in [src, tgt]:
        for split in SPLITS:
            pretokenize(
                op.join(root, f"{split}.{lang}"),
                op.join(root, f"{split}.moses.{lang}"),
                src,
                tgt,
            )
    # tokenize with BPE vocabulary
    if bpe_size is not None:
        # learn vocabulary
        concat_train_path = op.join(root, "train.all")
        _concat_files(
            [op.join(root, f"train.moses.{src}"), op.join(root, f"train.moses.{tgt}")],
            concat_train_path,
        )
        bpe_model_prefix = op.join(root, f"spm_bpe{bpe_size}")
        _get_bpe(concat_train_path, bpe_model_prefix, bpe_size)
        os.remove(concat_train_path)
        # apply
        for lang in [src, tgt]:
            for split in SPLITS:
                _apply_bpe(
                    bpe_model_prefix + ".model",
                    op.join(root, f"{split}.moses.{lang}"),
                    op.join(root, f"{split}.moses.bpe{bpe_size}.{lang}"),
                )
    # tokenize with byte vocabulary
    if need_bytes:
        for lang in [src, tgt]:
            for split in SPLITS:
                _get_bytes(
                    op.join(root, f"{split}.moses.{lang}"),
                    op.join(root, f"{split}.moses.bytes.{lang}"),
                )
    # tokenize with character vocabulary
    if need_chars:
        for lang in [src, tgt]:
            for split in SPLITS:
                _get_chars(
                    op.join(root, f"{split}.moses.{lang}"),
                    op.join(root, f"{split}.moses.chars.{lang}"),
                )
    # tokenize with byte-level BPE vocabulary
    if bbpe_size is not None:
        # learn vocabulary
        bchar_path = op.join(root, "train.bchar")
        _convert_to_bchar(op.join(root, "train.moses"), src, tgt, bchar_path)
        bbpe_model_prefix = op.join(root, f"spm_bbpe{bbpe_size}")
        _get_bpe(bchar_path, bbpe_model_prefix, bbpe_size)
        os.remove(bchar_path)
        # apply
        for lang in [src, tgt]:
            for split in SPLITS:
                _apply_bbpe(
                    bbpe_model_prefix + ".model",
                    op.join(root, f"{split}.moses.{lang}"),
                    op.join(root, f"{split}.moses.bbpe{bbpe_size}.{lang}"),
                )


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--root", type=str, default="data")
    parser.add_argument(
        "--bpe-vocab",
        default=None,
        type=int,
        help="Generate tokenized bitext with BPE of size K."
        "Default to None (disabled).",
    )
    parser.add_argument(
        "--bbpe-vocab",
        default=None,
        type=int,
        help="Generate tokenized bitext with BBPE of size K."
        "Default to None (disabled).",
    )
    parser.add_argument(
        "--byte-vocab",
        action="store_true",
        help="Generate tokenized bitext with bytes vocabulary",
    )
    parser.add_argument(
        "--char-vocab",
        action="store_true",
        help="Generate tokenized bitext with chars vocabulary",
    )
    args = parser.parse_args()

    preprocess_iwslt17(
        args.root,
        "fr",
        "en",
        bpe_size=args.bpe_vocab,
        need_chars=args.char_vocab,
        bbpe_size=args.bbpe_vocab,
        need_bytes=args.byte_vocab,
    )


if __name__ == "__main__":
    main()