Spaces:
Build error
Build error
File size: 4,438 Bytes
6cbcdf3 4c04f50 6cbcdf3 7dce6dc 7fd4e54 6cbcdf3 7dce6dc 6cbcdf3 7dce6dc 7fd4e54 7dce6dc 6cbcdf3 7dce6dc 7fd4e54 7dce6dc 6cbcdf3 7dce6dc 7fd4e54 7dce6dc 6cbcdf3 7dce6dc 6cbcdf3 7dce6dc 6cbcdf3 5dff3c8 6cbcdf3 5dff3c8 7dce6dc 6cbcdf3 e50699c 7dce6dc 6cbcdf3 7dce6dc 6cbcdf3 7dce6dc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
"""Separate text to zh en lists."""
# pylint: disable=unused-import, too-many-locals, invalid-name, too-many-branches, too-many-statements,
# from typing import Tuple,
from typing import Iterable, List, Optional, Tuple, Union # noqa
import numpy as np
# from fastlid import fastlid
from polyglot.text import Detector
from logzero import logger
from radiobee.lists2cmat import lists2cmat
from radiobee.detect import detect
def _detect_or_en(lines, set_languages):
    """Detect the language of the space-joined lines; fall back to "en" on error."""
    try:
        return detect(" ".join(lines), set_languages)
    except Exception as exc:  # deliberate best effort: log and carry on
        logger.error(exc)
        return "en"


def text2lists(
    text: Union[Iterable[str], str],
    set_languages: Optional[List[str]] = None,
) -> Tuple[List[str], List[str]]:
    """Separate mixed-language text into two lists, one per language.

    Non-blank lines are grouped into runs of consecutive lines for which
    ``detect`` reports the same language. Runs are assigned alternately to
    the first and the second list, starting with the first. One of the two
    lists is then padded with leading empty strings, using the diagonal of
    the correlation matrix that has the largest mean squared value.

    Args:
        text: mixed text; a string, or an iterable of strings that is
            joined with newlines.
        set_languages: language codes handed to ``detect``. When None, the
            first two language codes reported by polyglot.text.Detector
            for ``text`` are used, with "un" replaced by "en".

    Attributes:
        cmat: correlation matrix returned by ``lists2cmat``, computed
            before adjusting (shifting)
        offset: plus, [""] * offset + list2
            minus, [""] * (-offset) + list1
        lang1: language detected for the first list ("en" if detection failed)
        lang2: language detected for the second list ("en" if detection failed)

    Returns:
        two lists, best effort alignment

    Raises:
        Exception: whatever joining a non-str iterable raises is logged
            and re-raised unchanged.
    """
    if not isinstance(text, str) and isinstance(text, Iterable):
        try:
            text = "\n".join(text)
        except Exception as e:
            logger.error(e)
            raise

    # No languages given: let polyglot pick the two most likely ones.
    if set_languages is None:
        lang12 = [elm.code for elm in Detector(text).languages]
        set_languages = []
        for elm in lang12[:2]:
            if elm == "un":
                logger.warning(" Unknown language, set to en")
                set_languages.append("en")
            else:
                set_languages.append(elm)

    list1: List[str] = []
    list2: List[str] = []

    # The language of the whole text decides which language the first run
    # (collected into list1) is expected to be in.
    lang0 = detect(text, set_languages)
    res: List[str] = []
    left = True  # start with left list1
    for elm in (line for line in text.splitlines() if line.strip()):
        lang = detect(elm, set_languages)
        if lang == lang0:
            res.append(elm)
            continue
        # Language switched: flush the finished run and change sides.
        (list1 if left else list2).extend(res)
        left = not left
        res = [elm]
        lang0 = lang
    # process the last
    (list1 if left else list2).extend(res)

    lang1 = _detect_or_en(list1, set_languages)
    lang2 = _detect_or_en(list2, set_languages)

    # NOTE(review): cmat is assumed to have len(list2) rows and len(list1)
    # columns; the diagonal scan below relies on that. Confirm against
    # lists2cmat.
    ylim, xlim = len(list2), len(list1)
    cmat = lists2cmat(list1, list2, lang1, lang2)

    # Diagonal offsets of a (ylim x xlim) matrix run from 1 - ylim up to
    # xlim - 1 inclusive, xlim + ylim - 1 in total. The stop value used to
    # be xlim - 1, which skipped the last diagonal and left nothing to scan
    # (np.argmax raised ValueError) when both lists held a single line.
    sq_mean = [
        np.square(cmat.diagonal(elm)).mean() for elm in range(1 - ylim, xlim)
    ]

    # Index 0 of sq_mean corresponds to diagonal offset 1 - ylim.
    offset = np.argmax(sq_mean) - (ylim - 1)

    # Expose intermediate results as function attributes.
    text2lists.cmat = cmat
    text2lists.offset = offset
    text2lists.lang1 = lang1
    text2lists.lang2 = lang2

    # Pad list2 if offset >= 0, else pad list1.
    if offset > -1:
        list2 = [""] * offset + list2
    else:
        list1 = [""] * (-offset) + list1

    return list1, list2
|