"""Separate text to zh en lists.""" # pylint: disable= # from typing import Tuple, from typing import Iterable, List, Optional, Tuple, Union # noqa import numpy as np # from fastlid import fastlid from polyglot.text import Detector from logzero import logger from radiobee.lists2cmat import lists2cmat from radiobee.detect import detect def text2lists( text: Union[Iterable[str], str], set_languages: Optional[List[str]] = None, ) -> Tuple[List[str], List[str]]: """Separate text to zh en lists. Args: text: mixed text set_languages: no default (open-end) use polyglot.text.Detector to pick two languages Attributes: cmat: correlation matrix (len(list_l) x len(list_r)) before adjusting (shifting) offset: plus, [""] * offset + list2 minus, [""] * (-offset) + list1 Returns: two lists, best effort alignment """ if not isinstance(text, str) and isinstance(text, Iterable): try: text = "\n".join(text) except Exception as e: logger.error(e) raise # set_languages default to ["en", "zh"] if set_languages is None: lang12 = [elm.code for elm in Detector(text).languages] # set_languages = ["en", "zh"] # set 'un' to 'en' # set_languages = ['en' if elm in ['un'] else elm for elm in lang12[:2]] set_languages = [] for elm in lang12[:2]: if elm in ["un"]: logger.warning(" Unknown language, set to en") set_languages.append("en") else: set_languages.append(elm) # fastlid.set_languages = set_languages list1 = [] list2 = [] # lang0, _ = fastlid(text[:15000]) lang0 = detect(text, set_languages) res = [] left = True # start with left list1 for elm in [_ for _ in text.splitlines() if _.strip()]: # lang, _ = fastlid(elm) lang = detect(elm, set_languages) if lang == lang0: res.append(elm) else: if left: # list1.append("\n".join(res)) list1.extend(res) else: # list2.append("\n".join(res)) list2.extend(res) left = not left res = [elm] lang0 = lang # process the last if left: list1.extend(res) else: list2.extend(res) try: # lang1, _ = fastlid(' '.join(list1)) lang1 = detect(" ".join(list1), set_languages) except Exception as exc: logger.error(exc) lang1 = "en" try: # lang2, _ = fastlid(' '.join(list2)) lang2 = detect(" ".join(list2), set_languages) except Exception as exc: logger.error(exc) lang2 = "en" # find offset via diagonal(k), len1, len2 = len(list1), len(list2) # len2, len1 = cmat.shape # len_r, len_c = cmat.shape # ylim, xlim = cmat.shape ylim, xlim = len2, len1 # check # cmat dim: len1 x len2 or ylim x xlim cmat = lists2cmat(list1, list2, lang1, lang2) # sq_mean_pair = [(elm, np.square(cmat.diagonal(elm)).mean()) for elm in range(2 - ylim, xlim + 1)] # df = pd.DataFrame(sq_mean_pair, columns=['offset', 'sq_mean']) # df.plot.scatter('offset', 'sq_mean') # optimum_offset = df.offset[df.sq_mean.argmax()] # equiv to np.argmax(sq_mean) - (ylim - 2) # locate max, -ylim + 2 ...xlim: range(1 - ylim, xlim) # sqare sum sq_mean = [np.square(cmat.diagonal(elm)).mean() for elm in range(1 - ylim, xlim - 1)] # tot: xlim + ylim - 1 # temp = [np.square(cmat.diagonal(elm)) for elm in range(2 - ylim, xlim + 1)] # sq_mean = [elm.mean() if np.any(elm) else 0.0 for elm in temp] # plt.figure() # plt.scatter(range(1 - ylim, xlim), sq_mean) offset = np.argmax(sq_mean) - (ylim - 1) text2lists.cmat = cmat text2lists.offset = offset text2lists.lang1 = lang1 text2lists.lang2 = lang2 # shift list1 if offsset >= 0, else shift list2 if offset > -1: # list1a = list1[:] # list2a = [""] * offset + list2 list2 = [""] * offset + list2 else: list1 = [""] * (-offset) + list1 # list1a = [""] * (-offset) + list1 # list2a = list2[:] return list1, list2