Spaces:
Runtime error
Runtime error
OFA-OCR-dedao-demo001
/
fairseq
/examples
/wav2vec
/unsupervised
/scripts
/normalize_and_filter_text.py
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse | |
import fasttext as ft | |
import os | |
import regex | |
import sys | |
def get_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the normalize-and-filter script.

    Options:
        --fasttext-model: path to the fastText model file
            (default ``"lid.187.bin"``).
        --lang: language id to keep (required).
        --lid-threshold: float probability threshold for the requested
            language id (default ``0.4``).

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        description="reads text from stdin and outputs normalized, lid-filtered version to stdout"
    )
    parser.add_argument(
        "--fasttext-model",
        help="path to fasttext model",
        default="lid.187.bin",
    )
    parser.add_argument("--lang", help="language id", required=True)
    parser.add_argument(
        "--lid-threshold",
        type=float,
        help="threshold for this lang id probability",
        default=0.4,
    )
    return parser
def main():
    """Normalize text read from stdin and print the lines that pass language filtering.

    Each input line is stripped, every character that is not a Unicode
    letter, number, combining mark, apostrophe, space or hyphen is replaced
    by a space, and runs of whitespace are collapsed to single spaces.

    If the file given by ``--fasttext-model`` exists, it is loaded and a
    normalized line is printed only when the label ``__label__<lang>`` is
    among the top 100 predictions and is either ranked first or has a
    probability of at least ``--lid-threshold``. If the file does not exist,
    a warning is written to stderr and every normalized line is printed
    without filtering.
    """
    parser = get_parser()
    args = parser.parse_args()

    # Matches anything that is NOT a letter, number, mark, apostrophe,
    # space or hyphen; matches are replaced by a space below.
    filter_r = regex.compile(r"[^\p{L}\p{N}\p{M}\' \-]")

    lg = args.lang.lower()
    lg_label = f"__label__{lg}"
    thresh = args.lid_threshold

    if os.path.exists(args.fasttext_model):
        model = ft.load_model(args.fasttext_model)
    else:
        print(
            f"fasttext language id model {args.fasttext_model} not found. Proceeding without language filtering. "
            f"To enable language filtering, please download the latest language id model "
            f"from https://fasttext.cc/docs/en/language-identification.html",
            file=sys.stderr,
        )
        model = None

    for line in sys.stdin:
        line = line.strip()
        line = filter_r.sub(" ", line)
        # Collapse the whitespace runs left behind by the substitution.
        line = " ".join(line.split())

        if model is not None:
            lid, prob = model.predict(line, k=100)
            try:
                target_idx = lid.index(lg_label)
            except ValueError:
                # Requested language is not among the returned labels: drop the line.
                continue
            # Keep the line when the requested language is the top prediction
            # or its probability reaches the threshold.
            if target_idx == 0 or prob[target_idx] >= thresh:
                print(line)
        else:
            # No model available: emit every normalized line unfiltered.
            print(line)
# Run the filter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()