#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Gradio demo for language identification with three backends:
langid, fasttext (lid.176), and a custom allennlp classifier.

https://huggingface.co/spaces/sayakpaul/demo-docker-gradio
"""
import argparse
import json
import platform

from allennlp.models.archival import archive_model, load_archive
from allennlp.predictors.text_classifier import TextClassifierPredictor
import fasttext
from fasttext.FastText import load_model, _FastText
import gradio as gr
from gradio import inputs, outputs
from langid.langid import LanguageIdentifier, model

from project_settings import project_path, temp_directory
from toolbox.os.command import Command


def get_args():
    """Parse command-line arguments (markdown description file, examples file, fasttext model path)."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--language_identification_md_file",
        default=(project_path / "language_identification.md").as_posix(),
        type=str
    )
    parser.add_argument(
        "--lang_id_examples_file",
        default=(project_path / "lang_id_examples.json").as_posix(),
        type=str
    )
    parser.add_argument(
        "--fasttext_model",
        default=(project_path / "pretrained_models/lid.176.bin").as_posix(),
        type=str
    )
    args = parser.parse_args()
    return args


# Model singletons, populated in main() before the UI is launched.
lang_id_identifier: LanguageIdentifier = None
fasttext_model: _FastText = None
qgyd_lang_id_predictor: TextClassifierPredictor = None

trained_model_dir = project_path / "trained_models/huggingface"
trained_model_dir.mkdir(parents=True, exist_ok=True)


def init_qgyd_lang_id_predictor() -> TextClassifierPredictor:
    """
    Load (and, on first run, git-clone from the HuggingFace hub) the
    qgyd2021/language_identification allennlp archive and wrap it in a predictor.
    """
    model_name = "qgyd2021/language_identification"

    model_path = trained_model_dir / model_name
    if not model_path.exists():
        model_path.parent.mkdir(exist_ok=True)
        Command.cd(model_path.parent.as_posix())
        # clone creates the "language_identification" directory under model_path.parent
        Command.popen("git clone https://huggingface.co/{}".format(model_name))

    archive = load_archive(archive_file=model_path.as_posix())
    predictor = TextClassifierPredictor(
        model=archive.model,
        dataset_reader=archive.dataset_reader,
    )
    return predictor


def click_lang_id_button(text: str, ground_true: str, model_name: str):
    """
    Run language identification on `text` with the selected backend.

    :param text: sentence to classify.
    :param ground_true: ground-truth label from the UI; unused here but part
        of the gradio input signature, so it must stay in the parameter list.
    :param model_name: one of "langid", "fasttext", "qgyd_lang_id_1".
    :return: (label, prob) where prob is rounded to 4 places and stringified.
    """
    global lang_id_identifier
    global fasttext_model
    global qgyd_lang_id_predictor

    text = str(text).strip()

    if model_name == "langid":
        label, prob = lang_id_identifier.classify(text)
    elif model_name == "fasttext":
        label, prob = fasttext_model.predict(text, k=1)
        # fasttext labels look like "__label__en"; drop the 9-char prefix.
        label = label[0][9:]
        prob = prob[0]
    elif model_name == "qgyd_lang_id_1":
        json_dict = {
            "sentence": text
        }
        # renamed local (was `outputs`) to avoid shadowing the
        # `gradio.outputs` module imported at the top of the file
        prediction = qgyd_lang_id_predictor.predict_json(json_dict)
        label = prediction["label"]
        probs = prediction["probs"]
        prob = max(probs)
    else:
        label = "model_name not available."
        prob = -1

    return label, str(round(prob, 4))


def main():
    """Load the three models, build the gradio Interface, and serve it."""
    args = get_args()

    brief_description = """
Language Identification
"""

    # description (read for completeness; the UI currently shows brief_description)
    with open(args.language_identification_md_file, "r", encoding="utf-8") as f:
        description = f.read()

    # examples
    with open(args.lang_id_examples_file, "r", encoding="utf-8") as f:
        lang_id_examples = json.load(f)

    global lang_id_identifier
    global fasttext_model
    global qgyd_lang_id_predictor
    lang_id_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    fasttext_model = fasttext.load_model(args.fasttext_model)
    qgyd_lang_id_predictor = init_qgyd_lang_id_predictor()

    blocks = gr.Interface(
        click_lang_id_button,
        inputs=[
            inputs.Textbox(lines=3, label="text"),
            inputs.Textbox(label="ground_true"),
            inputs.Dropdown(choices=["langid", "fasttext", "qgyd_lang_id_1"],
                            default="langid", label="model_name"),
        ],
        outputs=[
            outputs.Textbox(label="label"),
            outputs.Textbox(label="prob"),
        ],
        examples=lang_id_examples,
        description=brief_description,
        title="Language Identification",
    )
    # BUG FIX: server_name / server_port are launch() options, not
    # gr.Interface constructor arguments; they were previously passed to
    # gr.Interface, where they are rejected.  Moved to launch().
    # NOTE: the original `share=False if Windows else False` evaluated to
    # False on every platform, so it is simplified to a constant.
    blocks.launch(
        share=False,
        server_name="127.0.0.1" if platform.system() == "Windows" else "0.0.0.0",
        server_port=7860,
    )
    return


if __name__ == "__main__":
    main()