OFA-OCR / fairseq /examples /backtranslation /deduplicate_lines.py
JustinLin610's picture
first commit
ee21b96
raw
history blame
1.22 kB
#!/usr/bin/python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import fileinput
import hashlib
import sys
from multiprocessing import Pool
def get_hashes_and_lines(raw_line):
hash = hashlib.md5(raw_line).hexdigest()
return hash, raw_line
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--workers", type=int, default=10)
parser.add_argument("files", nargs="*", help="input files")
args = parser.parse_args()
seen = set()
with fileinput.input(args.files, mode="rb") as h:
pool = Pool(args.workers)
results = pool.imap_unordered(get_hashes_and_lines, h, 1000)
for i, (hash, raw_line) in enumerate(results):
if hash not in seen:
seen.add(hash)
sys.stdout.buffer.write(raw_line)
if i % 1000000 == 0:
print(i, file=sys.stderr, end="", flush=True)
elif i % 100000 == 0:
print(".", file=sys.stderr, end="", flush=True)
print(file=sys.stderr, flush=True)
if __name__ == "__main__":
main()