File size: 2,253 Bytes
ee21b96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os

from fairseq import options

from examples.noisychannel import rerank_options, rerank_utils


def score_lm(args):
    using_nbest = args.nbest_list is not None
    (
        pre_gen,
        left_to_right_preprocessed_dir,
        right_to_left_preprocessed_dir,
        backwards_preprocessed_dir,
        lm_preprocessed_dir,
    ) = rerank_utils.get_directories(
        args.data_dir_name,
        args.num_rescore,
        args.gen_subset,
        args.gen_model_name,
        args.shard_id,
        args.num_shards,
        args.sampling,
        args.prefix_len,
        args.target_prefix_frac,
        args.source_prefix_frac,
    )

    predictions_bpe_file = pre_gen + "/generate_output_bpe.txt"
    if using_nbest:
        print("Using predefined n-best list from interactive.py")
        predictions_bpe_file = args.nbest_list

    gen_output = rerank_utils.BitextOutputFromGen(
        predictions_bpe_file, bpe_symbol=args.post_process, nbest=using_nbest
    )

    if args.language_model is not None:
        lm_score_file = rerank_utils.rescore_file_name(
            pre_gen, args.prefix_len, args.lm_name, lm_file=True
        )

    if args.language_model is not None and not os.path.isfile(lm_score_file):
        print("STEP 4.5: language modeling for P(T)")
        if args.lm_bpe_code is None:
            bpe_status = "no bpe"
        elif args.lm_bpe_code == "shared":
            bpe_status = "shared"
        else:
            bpe_status = "different"

        rerank_utils.lm_scoring(
            lm_preprocessed_dir,
            bpe_status,
            gen_output,
            pre_gen,
            args.lm_dict,
            args.lm_name,
            args.language_model,
            args.lm_bpe_code,
            128,
            lm_score_file,
            args.target_lang,
            args.source_lang,
            prefix_len=args.prefix_len,
        )


def cli_main():
    parser = rerank_options.get_reranking_parser()
    args = options.parse_args_and_arch(parser)
    score_lm(args)


if __name__ == "__main__":
    cli_main()