freemt
Update slow-track for more lang pairs
7fd4e54
raw
history blame
2.27 kB
"""Detect language via polyglot and fastlid."""
# pylint: disable=
from typing import Any, Callable, List, Optional
from polyglot.text import Detector
import polyglot.detect.base
from polyglot.detect.base import UnknownLanguage
from fastlid import fastlid
from logzero import logger
# Set the logger of polyglot's detect.base module to the "ERROR" level
# (presumably to quiet its lower-severity messages -- confirm it is a
# standard logging.Logger).
polyglot.detect.base.logger.setLevel("ERROR")
def with_func_attrs(**attrs: Any) -> Callable:
    """Build a decorator that attaches ``attrs`` to the decorated callable.

    Each keyword argument becomes an attribute of the same name on the
    callable handed to the returned decorator; the callable itself (not a
    wrapper) is returned.
    """

    def with_attrs(fct: Callable) -> Callable:
        # Attach every requested attribute, then hand back the same object.
        for name in attrs:
            setattr(fct, name, attrs[name])
        return fct

    return with_attrs
# @with_func_attrs(set_languages=None)
# def detect(text: str) -> str:
def detect(text: str, set_languages: Optional[List[str]] = None) -> str:
    """Detect the language of ``text`` via fastlid, falling back to polyglot.

    fastlid is tried first. Its answer is returned directly when its
    confidence is at least 0.3 or when it reports ``"zh"``. Otherwise
    polyglot's ``Detector`` is consulted and the first of its candidates
    (restricted to ``set_languages`` when given) is returned.

    An alternative implementation is in detec_alt.py.

    Args:
        text: the text whose language is to be detected.
        set_languages: optional list/tuple of language codes. It is assigned
            to ``fastlid.set_languages`` (presumably restricting fastlid's
            candidates -- confirm against fastlid's documentation) and is
            used to filter polyglot's candidates.

    Returns:
        A language code. On the polyglot path this is the first two
        characters of polyglot's code. ``"en"`` is the last-resort default;
        when polyglot raises ``UnknownLanguage`` and ``set_languages`` is
        non-empty, its first entry is used instead.

    Side effects:
        ``detect.lang_conf`` is set to fastlid's ``(lang, conf)`` pair and is
        overwritten with polyglot's ``[(lang, conf), ...]`` list when the
        polyglot call succeeds.
    """
    fastlid.set_languages = set_languages
    lang, conf = fastlid(text)
    detect.lang_conf = lang, conf
    if conf >= 0.3 or lang == "zh":
        return lang

    # fastlid was not confident enough: ask polyglot.
    try:
        langs = [(elm.code[:2], elm.confidence) for elm in Detector(text).languages]
    except UnknownLanguage:
        # An empty set_languages is treated like None so that indexing
        # cannot raise IndexError inside this handler.
        fallback = set_languages[0] if set_languages else "en"
        logger.warning(
            " UnknownLanguage exception: probably snippet too short, setting to %s",
            fallback,
        )
        langs = [(fallback, 0)]
    except Exception as exc:  # deliberate best-effort: never fail detection
        logger.error(exc)
        langs = [("en", 0)]
    else:
        detect.lang_conf = langs

    if set_languages is None:
        # No restriction: take polyglot's first candidate (guarding against
        # an empty candidate list).
        return langs[0][0] if langs else "en"

    # Warn before the membership tests below, and include the offending
    # value (the original message had a %s placeholder but no argument).
    if not isinstance(set_languages, (list, tuple)):
        logger.warning("set_languages (%s) ought to be a list/tuple", set_languages)

    # Pick the first candidate that is among the allowed languages.
    for code, _ in langs:
        if code in set_languages:
            return code
    return "en"