OFA-OCR-dedao-demo001

Runtime error

App Files Files Community

OFA-OCR-dedao-demo001 / fairseq /examples /wav2vec /unsupervised /scripts /normalize_and_filter_text.py

JustinLin610

first commit

ee21b96 almost 2 years ago

raw

history blame

No virus

2 kB

	#!/usr/bin/env python3
	# Copyright (c) Facebook, Inc. and its affiliates.
	#
	# This source code is licensed under the MIT license found in the
	# LICENSE file in the root directory of this source tree.

	import argparse
	import fasttext as ft
	import os
	import regex
	import sys


	def get_parser():
	    """Build the command-line parser for this script.

	    Registered options:
	        --fasttext-model: path to the fasttext model (default "lid.187.bin").
	        --lang:           language id; required.
	        --lid-threshold:  float threshold for the language-id probability
	                          (default 0.4).

	    Returns:
	        The configured ``argparse.ArgumentParser``.
	    """
	    cli = argparse.ArgumentParser(
	        description="reads text from stdin and outputs normalized, lid-filtered version to stdout"
	    )

	    # One (flag, keyword-arguments) entry per option, registered in order.
	    option_specs = (
	        (
	            "--fasttext-model",
	            {"help": "path to fasttext model", "default": "lid.187.bin"},
	        ),
	        (
	            "--lang",
	            {"help": "language id", "required": True},
	        ),
	        (
	            "--lid-threshold",
	            {
	                "type": float,
	                "help": "threshold for this lang id probability",
	                "default": 0.4,
	            },
	        ),
	    )
	    for flag, spec in option_specs:
	        cli.add_argument(flag, **spec)

	    return cli


	def main():
	    """Normalize stdin text and print the lines that pass language-id filtering.

	    Each input line is stripped, every character that is not a Unicode
	    letter, number or mark, an apostrophe, a space or a hyphen is replaced
	    by a space, and whitespace runs are collapsed to single spaces.

	    If the file given by ``--fasttext-model`` exists it is loaded. A
	    normalized line is then printed only when ``__label__<lang>`` (with
	    ``--lang`` lower-cased) appears among the labels returned by
	    ``model.predict(line, k=100)`` and is either the first label or has a
	    probability of at least ``--lid-threshold``. If the file does not
	    exist, a notice is written to stderr and every normalized line is
	    printed unfiltered.
	    """
	    args = get_parser().parse_args()
	    disallowed_re = regex.compile(r"[^\p{L}\p{N}\p{M}\' \-]")

	    wanted_label = f"__label__{args.lang.lower()}"
	    min_prob = args.lid_threshold

	    model = None
	    if os.path.exists(args.fasttext_model):
	        model = ft.load_model(args.fasttext_model)
	    else:
	        print(
	            f"fasttext language id model {args.fasttext_model} not found. Proceeding without language filtering. "
	            f"To enable language filtering, please download the latest language id model "
	            f"from https://fasttext.cc/docs/en/language-identification.html",
	            file=sys.stderr,
	        )

	    for raw in sys.stdin:
	        # Strip, blank out disallowed characters, then squeeze whitespace.
	        text = " ".join(disallowed_re.sub(" ", raw.strip()).split())

	        # Without a model there is nothing to filter on: emit everything.
	        if model is None:
	            print(text)
	            continue

	        labels, probs = model.predict(text, k=100)
	        # Drop the line when the requested language was not predicted at all.
	        if wanted_label not in labels:
	            continue

	        rank = labels.index(wanted_label)
	        # Keep the line if the language is the first prediction or is
	        # probable enough.
	        if rank == 0 or probs[rank] >= min_prob:
	            print(text)

	# Script entry point: run main() only when this file is executed directly.
	if __name__ == "__main__":
	main()