# -*- coding: utf-8 -*-
r"""
Corpora
==============
    Available corpora to train/test Polos models.
"""
import os
from typing import Optional

import click
from torchnlp.download import download_file_maybe_extract

# Maps each supported corpus name (lower-case) to the URL of its archive.
corpus2download = {
    "apequest": "https://unbabel-experimental-data-sets.s3-eu-west-1.amazonaws.com/polos/hter/apequest.zip",
    "qt21": "https://unbabel-experimental-data-sets.s3-eu-west-1.amazonaws.com/polos/hter/qt21.zip",
    "wmt-metrics": "https://unbabel-experimental-data-sets.s3-eu-west-1.amazonaws.com/polos/da/wmt-metrics.zip",
    "doc-wmt19": "https://unbabel-experimental-data-sets.s3-eu-west-1.amazonaws.com/polos/da/doc-wmt19.zip",
}

# Archive suffixes probed, in order, when cleaning up after a download.
_ARCHIVE_EXTENSIONS = (".zip", ".tar.gz")


def download_corpus(corpus: str, saving_directory: Optional[str] = None) -> None:
    """Function that downloads a corpus from AWS.

    The corpus name is lower-cased before it is looked up. If a directory
    named after the corpus already exists inside ``saving_directory`` the
    download is skipped. After a download, the compressed archive named
    ``<corpus>.zip`` or ``<corpus>.tar.gz`` is removed when present.

    :param corpus: Name of the corpus to be loaded.
    :param saving_directory: RELATIVE path to the saving folder. Defaults to
        ``data/`` when empty or ``None``. A trailing path separator is
        optional.

    :raises Exception: If ``corpus`` is neither cached nor a key of
        ``corpus2download``.
    """
    corpus = corpus.lower()
    if not saving_directory:
        saving_directory = "data/"

    # os.path.join copes with directories given with or without a trailing
    # separator; plain string concatenation would yield e.g. "dataqt21".
    corpus_path = os.path.join(saving_directory, corpus)

    if os.path.isdir(corpus_path):
        click.secho(f"{corpus} is already in cache.", fg="yellow")
        return

    # Validate before touching the filesystem so that an invalid corpus name
    # does not leave an empty saving directory behind.
    if corpus not in corpus2download:
        raise Exception(f"{corpus} is not a valid corpus!")

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(saving_directory, exist_ok=True)

    download_file_maybe_extract(
        corpus2download[corpus],
        directory=saving_directory,
    )
    click.secho("Download succeeded.", fg="yellow")

    # Remove the first matching archive; warn if none is found.
    for extension in _ARCHIVE_EXTENSIONS:
        archive_path = corpus_path + extension
        if os.path.exists(archive_path):
            os.remove(archive_path)
            break
    else:
        click.secho("Fail to delete compressed file.", fg="red")