JustinLin610's picture
first commit
ee21b96
raw
history blame
No virus
2 kB
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import fasttext as ft
import os
import regex
import sys
def get_parser():
    """Build the command-line interface for this script.

    Returns:
        An ``argparse.ArgumentParser`` that accepts:

        * ``--fasttext-model``: path to the fastText language-id model
          (default ``lid.187.bin``).
        * ``--lang``: language id to keep (required).
        * ``--lid-threshold``: float probability threshold for that
          language (default ``0.4``).
    """
    cli = argparse.ArgumentParser(
        description="reads text from stdin and outputs normalized, lid-filtered version to stdout"
    )

    # NOTE(review): the publicly distributed fastText language-id model is
    # presumably named lid.176.bin -- confirm that this default is intended.
    cli.add_argument(
        "--fasttext-model", default="lid.187.bin", help="path to fasttext model"
    )
    cli.add_argument("--lang", required=True, help="language id")
    cli.add_argument(
        "--lid-threshold",
        default=0.4,
        type=float,
        help="threshold for this lang id probability",
    )

    return cli
def main():
    """Normalize text from stdin and print the lines that pass language-id.

    Each input line is stripped, every character that is not a Unicode
    letter, number, combining mark, apostrophe, space or hyphen is replaced
    by a space, and runs of whitespace are collapsed to a single space.

    If the fastText model file given by ``--fasttext-model`` exists, a
    normalized line is printed only when the ``__label__<lang>`` label is
    among the (up to 100) predicted labels and is either the first one
    returned or has a probability of at least ``--lid-threshold``.  If the
    model file is missing, a warning is written to stderr and every
    normalized line is printed unfiltered.
    """
    args = get_parser().parse_args()

    # Matches every character outside the allowed set described above.
    disallowed = regex.compile(r"[^\p{L}\p{N}\p{M}\' \-]")
    wanted_label = f"__label__{args.lang.lower()}"
    min_prob = args.lid_threshold

    lid_model = None
    if os.path.exists(args.fasttext_model):
        lid_model = ft.load_model(args.fasttext_model)
    else:
        # Best-effort mode: warn and fall through to plain normalization.
        print(
            f"fasttext language id model {args.fasttext_model} not found. Proceeding without language filtering. "
            f"To enable language filtering, please download the latest language id model "
            f"from https://fasttext.cc/docs/en/language-identification.html",
            file=sys.stderr,
        )

    for raw_line in sys.stdin:
        scrubbed = disallowed.sub(" ", raw_line.strip())
        text = " ".join(scrubbed.split())

        if lid_model is None:
            print(text)
            continue

        labels, probs = lid_model.predict(text, k=100)
        try:
            rank = labels.index(wanted_label)
        except ValueError:
            # Target language is not among the returned labels: drop the line.
            continue

        # rank 0 is presumably the model's top prediction (verify against
        # the fastText API); it is accepted regardless of the threshold.
        if rank == 0 or probs[rank] >= min_prob:
            print(text)
# Run the stdin -> stdout filter only when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()