Spaces · Build error
freemt committed · Commit 4c04f50
Parent(s): 02e4e96
Update before sent-align
- radiobee/__main__.py +2 -4
- radiobee/align_sents.py +4 -2
- radiobee/align_texts.py +2 -0
- radiobee/amend_avec.py +1 -1
- radiobee/app.py +2 -0
- radiobee/cmat2tset.py +2 -0
- radiobee/docterm_scores.py +7 -5
- radiobee/en2zh.py +6 -1
- radiobee/error_msg.py +2 -0
- radiobee/files2df.py +2 -0
- radiobee/gen_aset.py +2 -0
- radiobee/gen_eps_minsamples.py +7 -4
- radiobee/gen_model.py +5 -6
- radiobee/gen_pset.py +7 -2
- radiobee/gen_row_alignment.py +1 -1
- radiobee/gen_vector.py +2 -2
- radiobee/gradiobee.py +7 -9
- radiobee/interpolate_pset.py +1 -0
- radiobee/lists2cmat.py +22 -1
- radiobee/loadtext.py +13 -14
- radiobee/mdx_e2c.py +1 -0
- radiobee/model_s.py +37 -0
- radiobee/paras2sents.py +110 -0
- radiobee/plot_cmat.py +4 -3
- radiobee/plot_df.py +2 -1
- radiobee/process_upload.py +3 -2
- radiobee/seg_text.py +2 -0
- radiobee/shuffle_sents.py +44 -15
- radiobee/smatrix.py +8 -5
- radiobee/text2lists.py +2 -1
- requirements.txt +3 -1
- run-pydocstle.bat +1 -0
- run-pylint.bat +1 -0
- tests/test_align_sents.py +60 -2
- tests/test_lists2cmat_hlm.py +2 -2
- tests/test_paras2sents.py +34 -0
- tests/test_shuffle_sents.py +136 -0
radiobee/__main__.py
CHANGED
@@ -1,9 +1,9 @@
 """Run interactively."""
-# pylint: disable=invalid-name, too-many-arguments, unused-argument, redefined-builtin, wrong-import-position, too-many-locals, too-many-statements
+# pylint: disable=invalid-name, too-many-arguments, unused-argument, redefined-builtin, unused-import, wrong-import-position, too-many-locals, too-many-statements
 from typing import Any, Tuple, Optional, Union  # noqa

 import sys
-from pathlib import Path
+from pathlib import Path  # noqa
 import platform
 import signal
 from random import randint
@@ -377,8 +377,6 @@ if __name__ == "__main__":
     """
     ).strip()

-    # "<p style='text-align: center'><a href='https://arxiv.org/abs/2112.11641' target='_blank'>JoJoGAN: One Shot Face Stylization</a>| <a href='https://github.com/mchong6/JoJoGAN' target='_blank'>Github Repo Pytorch</a></p> <center><img src='https://visitor-badge.glitch.me/badge?page_id=akhaliq_jojogan' alt='visitor badge'></center> <p style='text-align: center'>samples from repo: <img src='https://raw.githubusercontent.com/mchong6/JoJoGAN/main/teaser.jpg' alt='animation'/></p>"  # noqa
-
     article = dedent(
         """ <p style="text-align: center">readiobee docs:
         <a href="https://radiobee.readthedocs.io/" target="_blank">readthedocs</a>
radiobee/align_sents.py
CHANGED
@@ -1,5 +1,5 @@
 """Align sents via gale-church."""
-# pylint: disable=
+# pylint: disable=invalid-name

 from typing import List, Tuple  # noqa

@@ -38,8 +38,10 @@ def align_sents(lst1: List[str], lst2: List[str]) -> List[Tuple[str, str]]:

     texts = []
     # for elm in aset:
-    for elm0, elm1 in amended_avec:
+    # for elm0, elm1 in amended_avec:
+    for elm in amended_avec:
         # elm0, elm1, elm2 = elm
+        elm0, elm1 = elm[:2]
         _ = []

         # src_text first
radiobee/align_texts.py
CHANGED
@@ -1,4 +1,6 @@
 """Align texts based on aset, src_text, tgt_text."""
+# pylint: disable=unused-variable
+
 from typing import List, Tuple, Union
 from logzero import logger
radiobee/amend_avec.py
CHANGED
@@ -1,5 +1,5 @@
 """Amend avec from align_block."""
-# pylint: disable=
+# pylint: disable=unused-variable, unused-import

 from typing import List, Tuple, Union
radiobee/app.py
CHANGED
@@ -1,4 +1,6 @@
 """Talk to spaces VM via subprocess.check_output."""
+# pylint: disable=unused-variable, invalid-name
+
 # import httpx
 import subprocess as sp
 from shlex import split
radiobee/cmat2tset.py
CHANGED
@@ -1,4 +1,6 @@
 """Gen triple-set from a matrix."""
+# pylint: disable=unused-import
+
 from typing import List, Tuple, Union  # noqa

 import numpy as np
radiobee/docterm_scores.py
CHANGED
@@ -2,9 +2,11 @@

 refer also to fast-scores fast_scores.py and gen_model.py (sklearn.feature_extraction.text.TfidfVectorizer).
 """
+# pylint: disable=too-many-arguments, too-many-locals, invalid-name, unused-import
+
 from typing import Dict, Iterable, List, Optional, Union  # noqa
-import numpy as np
 from itertools import chain
+import numpy as np
 from psutil import virtual_memory
 from more_itertools import ilen

@@ -48,8 +50,8 @@ def docterm_scores(
         for xelm in iter(doc1):
             for elm in iter(xelm):
                 assert isinstance(elm, str)
-    except AssertionError:
-        raise AssertionError(" doc1 is not of the typing Iterable[Iterable[str]] ")
+    except AssertionError as exc:
+        raise AssertionError(" doc1 is not of the typing Iterable[Iterable[str]] ") from exc
     except Exception as e:
         logger.error(e)
         raise
@@ -57,8 +59,8 @@ def docterm_scores(
         for xelm in iter(doc2):
             for elm in iter(xelm):
                 assert isinstance(elm, str)
-    except AssertionError:
-        raise AssertionError(" doc2 is not of the typing Iterable[Iterable[str]] ")
+    except AssertionError as exc:
+        raise AssertionError(" doc2 is not of the typing Iterable[Iterable[str]] ") from exc
     except Exception as e:
         logger.error(e)
         raise
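The `raise ... from exc` form adopted here (and in gen_model.py and smatrix.py below) chains the original AssertionError as `__cause__`, so the traceback shows both the failing assert and the descriptive message, and it satisfies pylint's raise-missing-from check. A minimal sketch of the pattern, with a hypothetical `validate` helper:

    def validate(doc):
        """Raise a descriptive error while keeping the original as __cause__."""
        try:
            for elm in iter(doc):
                assert isinstance(elm, str)
        except AssertionError as exc:
            # chained: traceback shows "The above exception was the direct cause of ..."
            raise AssertionError("doc is not of the typing Iterable[str]") from exc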
radiobee/en2zh.py
CHANGED
@@ -5,7 +5,9 @@ from typing import Iterable, List, Union
 import warnings

 import copy
-
+
+# from radiobee.mdx_e2c import mdx_e2c  # moved to local for lazy loading
+# from lazy import lazy

 warnings.simplefilter('ignore', DeprecationWarning)

@@ -25,6 +27,9 @@ def en2zh(
     Returns
         res: list of str
     """
+    # to effect lazy loading
+    from radiobee.mdx_e2c import mdx_e2c  # pylint: disable=import-outside-toplevel
+
     res = copy.deepcopy(text)
     if isinstance(text, str):
         # res = [text.split()]
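Moving the `mdx_e2c` import from module level into `en2zh` defers loading the lzma-compressed dictionary until the first call, so importing the package stays fast. A sketch of the same lazy-loading idea (file and names hypothetical):

    _cache = {}

    def lookup(word: str) -> str:
        """Load the heavy dictionary only on first use, then reuse it."""
        if "dict" not in _cache:
            import joblib  # deferred: pay the load cost on the first call only
            _cache["dict"] = joblib.load("big_dict.lzma")  # hypothetical file
        return _cache["dict"].get(word, word)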
radiobee/error_msg.py
CHANGED
@@ -1,4 +1,6 @@
 """Prepare an error message for gradiobee."""
+# pylint: disable=invalid-name
+
 from typing import Optional, Tuple, Union
 import pandas as pd
radiobee/files2df.py
CHANGED
@@ -1,4 +1,6 @@
 """Convert two iesl to pandas.DataFrame."""
+# pylint: disable=invalid-name
+
 from itertools import zip_longest
 # import tempfile
 import pandas as pd
radiobee/gen_aset.py
CHANGED
@@ -1,4 +1,6 @@
 """Genereat align set (aset) based on pset (pair set), src_lang and tgt_len."""
+# pylint: disable=unused-variable
+
 from typing import List, Tuple, Union
 from itertools import zip_longest
radiobee/gen_eps_minsamples.py
CHANGED
@@ -4,10 +4,13 @@
 def gen_eps_minsamples(src_len: int, tgt_len: int) -> dict:
     """Gen suggested eps min_samples."""
     eps = src_len * 0.01
-
-
+
+    # if eps < 3: eps = 3
+    eps = max(3, eps)

     min_samples = tgt_len / 100 * 0.5
-
-
+
+    # if min_samples < 3: min_samples = 3
+    min_samples = max(3, min_samples)
+
     return {"eps": eps, "min_samples": min_samples}
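The new `max(3, ...)` floors keep the suggested DBSCAN parameters sane for short texts: src_len=100 would otherwise yield eps=1.0, and tgt_len=120 would yield min_samples=0.6. A quick check of the function as shown above:

    def gen_eps_minsamples(src_len: int, tgt_len: int) -> dict:
        """Gen suggested eps min_samples, floored at 3."""
        eps = max(3, src_len * 0.01)
        min_samples = max(3, tgt_len / 100 * 0.5)
        return {"eps": eps, "min_samples": min_samples}

    assert gen_eps_minsamples(100, 120) == {"eps": 3, "min_samples": 3}
    assert gen_eps_minsamples(1000, 2000) == {"eps": 10.0, "min_samples": 10.0}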
radiobee/gen_model.py
CHANGED
@@ -8,6 +8,8 @@ doc_term_matrix

 tokenized_docs = [insert_spaces(elm).split() for elm in textzh]
 """
+# pylint: disable=too-many-arguments, invalid-name, unused-import
+
 from typing import Dict, Iterable, List, Optional, Union  # noqa

 from textacy.representations import Vectorizer
@@ -30,16 +32,13 @@ def gen_model(
     """Generate a model (textacy.representations.Vectorizer).

     Args:
-
-
+        tokenized_docs: tokenized docs
         (refer to textacy.representation.Vectorizer)
         tf_type: Type of term frequency (tf) to use for weights' local component:
-
         - "linear": tf (tfs are already linear, so left as-is)
         - "sqrt": tf => sqrt(tf)
         - "log": tf => log(tf) + 1
         - "binary": tf => 1
-
         idf_type: Type of inverse document frequency (idf) to use for weights'
             global component:

@@ -91,8 +90,8 @@ def gen_model(
         for xelm in iter(tokenized_docs):
             for elm in iter(xelm):
                 assert isinstance(elm, str)
-    except AssertionError:
-        raise AssertionError(" tokenized_docs is not of the typing Iterable[Iterable[str]] ")
+    except AssertionError as e:
+        raise AssertionError(" tokenized_docs is not of the typing Iterable[Iterable[str]] ") from e
     except Exception as e:
         logger.error(e)
         raise
radiobee/gen_pset.py
CHANGED
@@ -2,6 +2,8 @@

 tinybee.find_pairs.py with fixed estimator='dbscan' eps=eps, min_samples=min_samples
 """
+# pylint: disable=too-many-locals, unused-import, invalid-name
+
 from typing import List, Tuple, Union

 import numpy as np
@@ -22,6 +24,7 @@ def _gen_pset(
 # ) -> List[Tuple[int, int, Union[float, str]]]:
 ) -> List[Tuple[Union[float, str], Union[float, str], Union[float, str]]]:
     """Gen pset from cmat.
+
     Find pairs for a given cmat.

     Args:
@@ -86,8 +89,9 @@ def _gen_pset(
     # low_ = np.min(ymax) - 1  # reset to minimum_value - 1

     buff = [(-1, -1, ""), (tgt_len, src_len, "")]
-
-    for idx, tset_elm in enumerate(tset):
+
+    # for idx, tset_elm in enumerate(tset):
+    for tset_elm in tset:
         logger.debug("buff: %s", buff)
         # postion max in ymax and insert in buff
         # if with range given by iset+-delta and
@@ -152,6 +156,7 @@ def gen_pset(

     Refer to _gen_pset.
     """
+    del verbose
     gen_pset.min_samples = min_samples
     for min_s in range(min_samples):
         logger.debug(" min_samples, try %s", min_samples - min_s)
radiobee/gen_row_alignment.py
CHANGED
@@ -35,7 +35,7 @@ idx += 1; i0, i1, i2 = resu[idx]; '***' if i0 == ''
 else src_text[int(i0)], '***' if i1 == '' else tgt_text[int(i1)], ''
 if i2 == '' else i2
 """
-# pylint: disable=line-too-long
+# pylint: disable=line-too-long, unused-variable
 from typing import List, Union

 # natural extrapolation with slope equal to 1
radiobee/gen_vector.py
CHANGED
@@ -9,11 +9,11 @@ from radiobee.insert_spaces import insert_spaces


 def gen_vector(text: Union[str, List[str]], model: Vectorizer) -> List[float]:
-    """Gen vector for a give model.
+    r"""Gen vector for a give model.

     Args:
         text: string of Chinese chars or English words.
-
+        model: model used
     filename = r"data\test-dual.txt"
     text = loadtext(filename)
     list1, list2 = zip(*text2lists(text))
radiobee/gradiobee.py
CHANGED
@@ -1,5 +1,5 @@
 """Gradiobee."""
-# pylint: disable=invalid-name
+# pylint: disable=invalid-name, too-many-arguments, too-many-branches, too-many-locals, too-many-statements, unused-variable, too-many-return-statements, unused-import
 from pathlib import Path
 import platform
 import inspect
@@ -12,9 +12,9 @@ from fastlid import fastlid
 from logzero import logger
 from icecream import ic

-import numpy as np
+import numpy as np  # noqa
 import pandas as pd
-import matplotlib
+import matplotlib  # noqa
 import matplotlib.pyplot as plt
 import seaborn as sns

@@ -32,11 +32,8 @@ from radiobee.text2lists import text2lists

 uname = platform.uname()
 HFSPACES = False
-
-if True:
+if "amzn2" in uname.release:  # on hf spaces
     HFSPACES = True
-    from sentence_transformers import SentenceTransformer  # noqa
-    model_s = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v1')

 sns.set()
 sns.set_style("darkgrid")
@@ -191,8 +188,8 @@ def gradiobee(

     logger.debug("lang1: %s, lang2: %s", lang1, lang2)
     if debug:
-
-
+        ic(f" lang1: {lang1}, lang2: {lang2}")
+        ic("fast track? ", lang1 in lang_en_zh and lang2 in lang_en_zh)

     # fast track
     if lang1 in lang_en_zh and lang2 in lang_en_zh:
@@ -225,6 +222,7 @@ def gradiobee(
             )
             return error_msg(msg, "info ")
         try:
+            from radiobee.model_s import model_s  # pylint: disable=import-outside-toplevel
             vec1 = model_s.encode(list1)
             vec2 = model_s.encode(list2)
             # cmat = vec1.dot(vec2.T)
radiobee/interpolate_pset.py
CHANGED
@@ -1,4 +1,5 @@
 """Interpolate np.nan."""
+# pylint: disable=invalid-name
 from typing import List, Tuple
 import numpy as np
 import pandas as pd
radiobee/lists2cmat.py
CHANGED
@@ -1,5 +1,6 @@
 """Convert two lists of str (texts) to correlation matrix."""
-#
+# pylint: disable=too-many-arguments, too-many-locals, unused-import
+
 from typing import Dict, Iterable, List, Optional, Union  # noqa

 import numpy as np
@@ -32,6 +33,26 @@ def lists2cmat(
     vocabulary_terms: Optional[Union[Dict[str, int], Iterable[str]]] = None
 ) -> np.ndarray:
     # fmt: on
+    """Convert two lists to cmat.
+
+    Args:
+        text1: refer smatrix
+        text2: refer smatrix
+        lang1: optional 1st lang code
+        lang2: optional 2nd lang code
+        dl_type: doc lenth
+        idf_type: idf tyoe
+        max_df: max doc freq
+        max_n_terms: max n terms
+        min_df: min doc freq
+        model: optional model
+        norm: norm
+        tf_type: term freq type
+        vocabulary_terms: vocab refer smatrix
+
+    Returs
+        cmat
+    """
     if isinstance(text1, str):
         text1 = [text1]
     if isinstance(text2, str):
radiobee/loadtext.py
CHANGED
@@ -1,5 +1,4 @@
-"""
-Load file content to text.
+"""Load file content to text.

 Check encoding and load a file to text.

@@ -16,6 +15,8 @@ magic.from_file("testdata/test.pdf")
 original load_textrev
 refer to load_paras.py
 """
+# pylint: disable=line-too-long, unused-variable, unused-import
+
 from typing import Optional, Union  # noqa
 from pathlib import Path
 import cchardet
@@ -34,7 +35,7 @@ def loadtext(filepath: Union[Path, str] = "") -> str:
     if not filepath.is_file():
         logger.error(" file [%s] does not exist or is not a file.", filepath)
         # return None
-        raise Exception(" file [
+        raise Exception(" file [{filepath}] does not exist or is not a file.")

     # encoding = detect_file(filepath)
     encoding = cchardet.detect(filepath.read_bytes()).get("encoding", "utf8")
@@ -44,7 +45,7 @@ def loadtext(filepath: Union[Path, str] = "") -> str:

     # cchardet: 'GB18030', no need for errors="ignore"
     try:
-        text = filepath.read_text(encoding, errors="ignore")
+        text = filepath.read_text(encoding=encoding, errors="ignore")
     except Exception as exc:
         logger.error(" Opening %s resulted in errors: %s", filepath, exc)
         raise
@@ -53,8 +54,7 @@ def loadtext(filepath: Union[Path, str] = "") -> str:


 def test1():
-    r"""
-    Tests default file.
+    r"""Tests default file.

     defaultdir = r'D:\dl\Dropbox\mat-dir\snippets-mat\pyqt'
     defaultfile = r'notes pyqt tkinter tktable.txt'
@@ -69,10 +69,11 @@ def test1():


 def testgb():
-    r"""
-
-
-
+    r"""Tests shuangyu_ku\txt-books\19部世界名著中英文对照版TXT."""
+    file = (
+        r"C:\dl\Dropbox\shuangyu_ku\txt-books\19部世界名著中英文对照版TXT"
+        r"\爱丽丝漫游奇境记.txt"
+    )
     text = loadtext(file)
     if text:
         # assert len(text) == 190913
@@ -84,10 +85,8 @@ def testgb():
     assert text0 == text[:500]


-def
-    r"""
-    Test 'E:\\beta_final_version\\build\\test_files\\files_for_testing_import\\Folding_Beijing_12.txt'.
-    """
+def test_utf_16le():
+    r"""Test 'E:\\beta_final_version\\build\\test_files\\files_for_testing_import\\Folding_Beijing_12.txt'."""
     # file = 'E:\\beta_final_version\\build\\test_files\\files_for_testing_import\\Folding_Beijing_12.txt'  # NOQA
     file = r"C:\dl\Dropbox\mat-dir\snippets-mat\pyqt\Sandbox\hp_beta-version_files\test_files\files_for_testing_import\Folding_Beijing_12.txt"  # NOQA
     file = r"C:\dl\Dropbox\mat-dir\pyqt\Sandbox\hp_beta-version_files\test_files\files_for_testing_import\Folding_Beijing_12.txt"
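The read fix passes `encoding` by keyword to `Path.read_text(encoding=None, errors=None)`. A condensed sketch of the detect-then-read flow used by `loadtext` (file name hypothetical):

    from pathlib import Path
    import cchardet

    def read_any(filename: str) -> str:
        """Detect the encoding from raw bytes, then decode leniently."""
        fpath = Path(filename)
        detected = cchardet.detect(fpath.read_bytes()).get("encoding") or "utf8"
        return fpath.read_text(encoding=detected, errors="ignore")

    # text = read_any("data/test-dual.txt")  # handles GB18030, UTF-16LE, UTF-8, ...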
radiobee/mdx_e2c.py
CHANGED
@@ -3,6 +3,7 @@
 mdx_e2c = joblib.load("./mdx_dict_e2c.lzma")
 mdx_c2e = joblib.load("./mdx_dict_e2c.lzma")
 """
+# pylint: disable=invalid-name,
 from pathlib import Path
 from string import punctuation
 import joblib
radiobee/model_s.py
ADDED
@@ -0,0 +1,37 @@
+"""Load model_s."""
+# pylint: disable=invalid-name
+
+from pathlib import Path
+
+import joblib
+from huggingface_hub import hf_hub_url, cached_download  # hf_hub_download,
+from alive_progress import alive_bar
+from logzero import logger
+
+
+def load_model_s():
+    """Load local model_s if present, else fetch from hf.co."""
+    file_loc = "radiobee/model_s"
+    if Path(file_loc).exists():
+        # raise Exception(f"File {file_loc} does not exist.")
+
+        with alive_bar(1, title=" Loading model_s, takes ~30 secs ...", length=3) as progress_bar:
+            model = joblib.load(file_loc)
+
+            # model_s = pickle.load(open(file_loc, "rb"))
+            progress_bar()  # pylint: disable=not-callable
+
+        return model
+
+    logger.info(
+        "Fetching and caching model_s from huggingface.co... "
+        "The first time may take a while depending on your net."
+    )
+    with alive_bar(1, title=" Subsequent loading takes ~20 secs ...", length=3) as progress_bar:
+        model = joblib.load(cached_download(hf_hub_url("mikeee/model_s", "model_s")))
+        progress_bar()  # pylint: disable=not-callable
+
+    return model
+
+
+model_s = load_model_s()
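This new module caches a sentence-embedding model (the previous revision instantiated SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v1') inline in gradiobee.py) and loads it once at import. A usage sketch; downstream code treats the dot product of the encoded batches as a similarity matrix:

    from radiobee.model_s import model_s  # runs load_model_s() on first import

    sents_en = ["A cat sits on the mat.", "It is raining."]
    sents_zh = ["猫坐在垫子上。", "正在下雨。"]

    vec1 = model_s.encode(sents_en)  # (2, dim) array
    vec2 = model_s.encode(sents_zh)
    cmat = vec2.dot(vec1.T)          # shape (len(sents_zh), len(sents_en))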
radiobee/paras2sents.py
ADDED
@@ -0,0 +1,110 @@
+"""Convert paras to sents."""
+# pylint: disable=unused-import, too-many-branches, ungrouped-imports
+
+from typing import Callable, List, Optional, Tuple, Union
+
+from itertools import zip_longest
+import numpy as np
+import pandas as pd
+from logzero import logger
+
+from radiobee.align_sents import align_sents
+from radiobee.seg_text import seg_text
+from radiobee.detect import detect
+
+try:
+    from radiobee.shuffle_sents import shuffle_sents
+except Exception as exc:
+    logger.error("shuffle_sents not available: %s, using align_sents", exc)
+    shuffle_sents = lambda x1, x2, lang1="", lang2="": align_sents(x1, x2)  # noqa
+
+
+def paras2sents(
+    paras_: Union[pd.DataFrame, List[Tuple[str, str, Union[str, float]]], np.ndarray],
+    align_func: Optional[Union[Callable, str]] = None,
+    lang1: Optional[str] = None,
+    lang2: Optional[str] = None,
+) -> List[Tuple[str, str, Union[str, float]]]:
+    """Convert paras to sents using align_func.
+
+    Args:
+        paras_: list of 3-tuples or numpy or pd.DataFrame
+        lang1: fisrt lang code
+        lang2: second lang code
+        align_func: func used in the sent level
+            if set to None, default to align_sents
+    Returns:
+        list of sents (possible with likelihood for shuffle_sents)
+    """
+    # wrap everything in pd.DataFrame
+    # necessary to make pyright happy
+    paras = pd.DataFrame(paras_).fillna("")
+
+    # take the first three columns at maximum
+    paras = paras.iloc[:, :3]
+
+    if len(paras.columns) < 2:
+        logger.error(
+            "Need at least two columns, got %s",
+            len(paras.columns)
+        )
+        raise Exception("wrong data")
+
+    # append the third col (all "") if there are only two cols
+    if len(paras.columns) < 3:
+        paras.insert(2, "likelihood", [""] * len(paras))
+
+    if lang1 is None:
+        lang1 = detect(" ".join(paras.iloc[:, 0]))
+    if lang2 is None:
+        lang2 = detect(" ".join(paras.iloc[:, 1]))
+
+    left, right = [], []
+    row0, row1 = [], []
+    for elm0, elm1, elm2 in paras.values:
+        sents0 = seg_text(elm0, lang1)
+        sents1 = seg_text(elm1, lang2)
+        if isinstance(elm2, float) and elm2 > 0:
+            if row0 or row1:
+                left.append(row0)
+                right.append(row1)
+                row0, row1 = [], []  # collect and prepare
+
+            if sents0:
+                left.append(sents0)
+            if sents1:
+                right.append(sents1)
+        else:
+            if sents0:
+                row0.extend(sents0)
+            if sents1:
+                row1.extend(sents1)
+    # collect possible last batch
+    if row0 or row1:
+        left.append(row0)
+        right.append(row1)
+
+    # res = [*zip(left, right)]
+
+    # align each batch using align_func
+
+    # ready align_func
+    if align_func is None:
+        align_func = align_sents
+    if isinstance(align_func, str) and align_func.startswith("shuffle") or not isinstance(align_func, str) and align_func.__name__ in ["shuffle_sents"]:
+        align_func = lambda row0, row1: shuffle_sents(row0, row1, lang1=lang1, lang2=lang2)  # noqa
+    else:
+        align_func = align_sents
+
+    res = []
+    for row0, row1 in zip(left, right):
+        try:
+            _ = align_func(row0, row1)
+        except Exception as exc:
+            logger.error("errors: %s, resorting to zip_longest", exc)
+            _ = [*zip_longest(row0, row1, fillvalue="")]
+
+        # res.append(_)
+        res.extend(_)
+
+    return res
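A usage sketch for the new helper (inline data hypothetical; see tests/test_paras2sents.py below for the xlsx-driven version). Rows whose third column carries a positive float are treated as already-aligned paragraph boundaries; everything else is pooled and re-aligned at sentence level:

    import pandas as pd
    from radiobee.paras2sents import paras2sents

    paras = pd.DataFrame({
        "text1": ["Hello world. How are you?"],
        "text2": ["你好世界。你好吗？"],
        "likelihood": [0.9],
    })

    sents = paras2sents(paras)                     # default align_func: align_sents
    # sents = paras2sents(paras, "shuffle_sents")  # tf-idf / model_s based alignment
    for pair in sents:
        print(pair)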
radiobee/plot_cmat.py
CHANGED
@@ -1,5 +1,6 @@
 """Plot pandas.DataFrame with DBSCAN clustering."""
-# pylint: disable=invalid-name, too-many-arguments
+# pylint: disable=invalid-name, too-many-arguments, too-many-locals
+
 import numpy as np
 import pandas as pd
 import matplotlib
@@ -37,13 +38,13 @@ def plot_cmat(
     backend: str = "Agg",
     showfig: bool = False,
 ):
-    # ) -> plt:
     # fmt: on
     """Plot df with DBSCAN clustering.

     Args:
         df_: pandas.DataFrame, with three columns columns=["x", "y", "cos"]
-
+
+    Returns
         matplotlib.pyplot: for possible use in gradio

     plot_df(pd.DataFrame(cmat2tset(smat), columns=['x', 'y', 'cos']))
radiobee/plot_df.py
CHANGED
@@ -1,5 +1,5 @@
 """Plot pandas.DataFrame with DBSCAN clustering."""
-# pylint: disable=invalid-name, too-many-arguments
+# pylint: disable=invalid-name, too-many-arguments, unused-import
 import numpy as np  # noqa
 import pandas as pd
 import matplotlib
@@ -38,6 +38,7 @@ def plot_df(

     Args:
         df_: pandas.DataFrame, with three columns columns=["x", "y", "cos"]
+
     Returns:
         matplotlib.pyplot: for possible use in gradio

radiobee/process_upload.py
CHANGED
@@ -1,4 +1,5 @@
 """Process uploads."""
+# pylint: disable=invalid-name, unused-import
 from typing import Union

 from pathlib import Path
@@ -51,7 +52,7 @@ def process_upload(upload: Union[tempfile._TemporaryFileWrapper, bytes]) -> str:

     if encoding is not None:
         try:
-            text = fpath.read_text(encoding)
+            text = fpath.read_text(encoding=encoding)
         except Exception as e:
             logger.error("Unable to retrieve text, error: %s", e)
             text = str(e)
@@ -63,7 +64,7 @@ def process_upload(upload: Union[tempfile._TemporaryFileWrapper, bytes]) -> str:
     # not able to cchardet: encoding is None, docx, pdf, epub, zip etc
     logger.info("Trying docx...to be implemented")

-    #
+    # T ODO .docx .epub .mobi .pdf etc.

     _ = Path(upload.name)
     msg = f"binary file: {_.stem[:-8]}{_.suffix}"
radiobee/seg_text.py
CHANGED
@@ -97,6 +97,8 @@ def seg_text(

     Arguments:
         lst: text or text list
+        lang: optional lang code
+        maxlines: (default 1000), threshold for turn on tqdm progressbar, set to <1 or a large number to turn it off
         extra: re.split(rf"{extra}, text) first
     Returns:
         list of splitted text.
radiobee/shuffle_sents.py
CHANGED
@@ -1,8 +1,9 @@
 """Shuffle sents."""
-# pylint: disable=
+# pylint: disable=unused-import, too-many-arguments, too-many-locals,

 from typing import List, Optional, Tuple, Union

+import pandas as pd
 from fastlid import fastlid
 from logzero import logger  # noqa

@@ -26,12 +27,23 @@ def shuffle_sents(
     lang2: Optional[str] = None,
 ) -> List[Tuple[str, str, Union[str, float]]]:
     # fmt: on
-    """
+    """Shuffle sents to the right positions.

     Based on __main__.py.
+
+    eps: float = 6
+    min_samples: int = 4
+    tf_type: str = "linear"
+    idf_type: Optional[str] = None
+    dl_type: Optional[str] = None
+    norm: Optional[str] = None
+    lang1: Optional[str] = "en"
+    lang2: Optional[str] = "zh"
     """
     set_languages = fastlid.set_languages
-    fastlid.set_languages = ["en", "zh"]
+    # fastlid.set_languages = ["en", "zh"]
+    fastlid.set_languages = None
+
     if lang1 is None:
         lang1, _ = fastlid(" ".join(lst1))
     if lang2 is None:
@@ -40,16 +52,28 @@ def shuffle_sents(
     # restore fastlid.set_languages
     fastlid.set_languages = set_languages

-
-
-
-
-
-
-
-
-
-
+    lang_dicts = ["en", "zh"]
+    if lang1 in lang_dicts and lang2 in lang_dicts:
+        cmat = lists2cmat(
+            lst1,
+            lst2,
+            tf_type=tf_type,
+            idf_type=idf_type,
+            dl_type=dl_type,
+            norm=norm,
+            lang1=lang1,
+            lang2=lang2,
+        )
+    else:  # use model_s
+        from radiobee.model_s import model_s  # pylint: disable=import-outside-toplevel
+        vec1 = model_s.encode(lst1)
+        vec2 = model_s.encode(lst2)
+        # cmat = vec1.dot(vec2.T)
+        cmat = vec2.dot(vec1.T)
+
+    shuffle_sents.cmat = cmat
+    shuffle_sents.lang1 = lang1
+    shuffle_sents.lang2 = lang2

     pset = gen_pset(
         cmat,
@@ -63,6 +87,11 @@ def shuffle_sents(

     final_list = align_texts(aset, lst2, lst1)

-    return final_list
-
+    # return final_list
+
+    # swap columns 0, 1
+    _ = pd.DataFrame(final_list)
+
+    _ = _.iloc[:, [1, 0] + [*range(2, _.shape[1])]]
+
+    return _.to_numpy().tolist()
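Note the orientation: both branches produce a cmat of shape (len(lst2), len(lst1)) — lists2cmat behaves the same way (cf. the (55, 135)/(135, 55) shapes asserted in tests/test_lists2cmat_hlm.py below) — and the trailing DataFrame slice swaps columns 0 and 1 so lst1 entries come first in the output. A shape-check sketch with random stand-ins for the model_s embeddings:

    import numpy as np

    lst1 = ["a", "b", "c"]  # 3 source sents
    lst2 = ["x", "y"]       # 2 target sents

    rng = np.random.default_rng(0)
    vec1 = rng.normal(size=(len(lst1), 512))  # stand-in for model_s.encode(lst1)
    vec2 = rng.normal(size=(len(lst2), 512))

    cmat = vec2.dot(vec1.T)
    assert cmat.shape == (len(lst2), len(lst1))  # rows: lst2, cols: lst1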
radiobee/smatrix.py
CHANGED
@@ -3,13 +3,16 @@
 refer also to fast-scores fast_scores.py and gen_model.py (sklearn.feature_extraction.text.TfidfVectorizer).
 originally docterm_scores.py.
 """
+# pylint: disable=invalid-name, too-many-locals, too-many-arguments
+
 from typing import Dict, Iterable, Optional, Union
-import numpy as np
 from itertools import chain
+import numpy as np
 from psutil import virtual_memory
 from more_itertools import ilen

 from textacy.representations import Vectorizer
+
 # from textacy.representations.vectorizers import Vectorizer
 from logzero import logger

@@ -51,8 +54,8 @@ def smatrix(
         for xelm in iter(doc1):
            for elm in iter(xelm):
                 assert isinstance(elm, str)
-    except AssertionError:
-        raise AssertionError(" doc1 is not of the typing Iterable[Iterable[str]] ")
+    except AssertionError as exc:
+        raise AssertionError(" doc1 is not of the typing Iterable[Iterable[str]] ") from exc
     except Exception as e:
         logger.error(e)
         raise
@@ -60,8 +63,8 @@ def smatrix(
         for xelm in iter(doc2):
             for elm in iter(xelm):
                 assert isinstance(elm, str)
-    except AssertionError:
-        raise AssertionError(" doc2 is not of the typing Iterable[Iterable[str]] ")
+    except AssertionError as exc:
+        raise AssertionError(" doc2 is not of the typing Iterable[Iterable[str]] ") from exc
     except Exception as e:
         logger.error(e)
         raise
radiobee/text2lists.py
CHANGED
@@ -1,5 +1,6 @@
 """Separate text to zh en lists."""
-# pylint: disable=
+# pylint: disable=unused-import, too-many-locals, invalid-name, too-many-branches, too-many-statements,
+
 # from typing import Tuple,
 from typing import Iterable, List, Optional, Tuple, Union  # noqa
requirements.txt
CHANGED
@@ -24,4 +24,6 @@ pycld2
 tqdm
 polyglot
 sentence_splitter
-icecream
+icecream
+# lazy
+alive-progress
run-pydocstle.bat
ADDED
@@ -0,0 +1 @@
+pydocstyle --convention=google radiobee tests
run-pylint.bat
ADDED
@@ -0,0 +1 @@
+pylint radiobee -d duplicate-code
tests/test_align_sents.py
CHANGED
@@ -1,9 +1,14 @@
 """Test align_sents."""
 from radiobee.align_sents import align_sents
+from radiobee.seg_text import seg_text

+text1 = """`Wretched inmates!' I ejaculated mentally, `you deserve perpetual isolation from your species for your churlish inhospitality. At least, I would not keep my doors barred in the day time. I don't care--I will get in!' So resolved, I grasped the latch and shook it vehemently. Vinegar-faced Joseph projected his head from a round window of the barn."""
+text2 = """“被囚禁的囚犯!”我在精神上被射精,“你应该永远与你的物种隔绝,因为你这种粗鲁的病态。至少,我白天不会锁门,我不在乎,我进去了!”我决心如此,我抓住了门锁,狠狠地摇了一下。醋脸的约瑟夫从谷仓的圆窗朝他的头照射。"""
+text3 = """"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit. Zumindest würde ich meine Türen tagsüber nicht verriegeln. Das ist mir egal - ich werde reinkommen!' So entschlossen, ergriff ich die Klinke und rüttelte heftig daran. Der essiggesichtige Joseph streckte seinen Kopf aus einem runden Fenster der Scheune."""

-
-def test_align_sents():
+
+def test_align_sents_sanity():
+    """Test align_sents sanity check."""
     lst1, lst2 = [
         "a",
         "bs",
@@ -11,3 +16,56 @@ def test_align_sents():
     res = align_sents(lst1, lst2)

     assert res == [("a", "aaa"), ("a", "34"), ("bs", "a"), ("bs", "b")]
+
+
+def test_align_sents_en_zh():
+    """Test align_sents en-zh."""
+    sents_en = seg_text(text1)
+    sents_zh = seg_text(text2)
+
+    # 9ms vs shuffle_sents 50ms shuffle_sents wth lang1lang2 40ms
+    res = align_sents(sents_en, sents_zh)
+
+    _ = """res[2:4]
+    Out[26]:
+    [('At least, I would not keep my doors barred in the day time.',
+      '至少,我白天不会锁门,我不在乎,我进去了!”'),
+     ("I don't care--I will get in!'", '至少,我白天不会锁门,我不在乎,我进去了!”')]
+    """
+    assert "至少" in str(res[2])
+    assert "至少" in str(res[3])
+
+
+def test_align_sents_en_de():
+    """Test align_sents en-zh."""
+    sents_en = seg_text(text1)
+    sents_de = seg_text(text3)
+
+    res1 = align_sents(sents_en, sents_de)
+    _ = """In [48]: res1[:2]
+    Out[48]:
+    [("`Wretched inmates!'",
+      '"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit.'),
+     ('I ejaculated mentally, `you deserve perpetual isolation from your species for your churlish inhospitality.',
+      '"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit.')]
+    """
+    assert "Elende" in str(res1[0])
+    assert "Elende" in str(res1[1])
+
+
+    _ = """
+    [("`Wretched inmates!'",
+      '"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit.'),
+     ('I ejaculated mentally, `you deserve perpetual isolation from your species for your churlish inhospitality.',
+      '"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit.'),
+     ('At least, I would not keep my doors barred in the day time.',
+      'Zumindest würde ich meine Türen tagsüber nicht verriegeln.'),
+     ("I don't care--I will get in!'",
+      "Das ist mir egal - ich werde reinkommen!'"),
+     ('So resolved, I grasped the latch and shook it vehemently.',
+      'So entschlossen, ergriff ich die Klinke und rüttelte heftig daran.'),
+     ('Vinegar-faced Joseph projected his head from a round window of the barn.',
+      'Der essiggesichtige Joseph streckte seinen Kopf aus einem runden Fenster der Scheune.')]
+
+
+    """
tests/test_lists2cmat_hlm.py
CHANGED
@@ -37,9 +37,9 @@ def test_lists2cmat_hlm():
     # cmat = texts2cmat(lst1, lst2, lang1, lang2)
     cmat = lists2cmat(lst1, lst2, lang1, lang2)

-    assert cmat.shape == (
+    assert cmat.shape == (55, 135)

     cmat21 = lists2cmat(lst2, lst1, lang2, lang1)

-    assert cmat21.shape == (
+    assert cmat21.shape == (135, 55)
     assert lists2cmat(lst2, lst1).mean() > 0.05  # 0.09
tests/test_paras2sents.py
ADDED
@@ -0,0 +1,34 @@
+"""Test paras2sents."""
+# pylint: disable=invalid-name
+
+import pandas as pd
+from radiobee.paras2sents import paras2sents
+from radiobee.shuffle_sents import shuffle_sents
+
+file_loc = r"data/test-dual-zh-en.xlsx"
+paras = pd.read_excel(file_loc, header=0)
+paras = paras[["text1", "text2", "likelihood"]].fillna("")
+
+
+def test_paras2sents_dual():
+    """Test paras2sents_dual."""
+    sents = paras2sents(paras)
+
+    assert len(sents) > 202  # 208
+    # assert not sents
+
+
+def test_paras2sents_dual_model_s():
+    """Test paras2sents_dual_model_s."""
+    sents = paras2sents(paras, shuffle_sents)
+
+    assert len(sents) > 201  # 207
+    # assert not sents
+
+
+_ = """
+df = pd.DataFrame(
+    [list(sent) + [""] if len(sent) == 2 else list(sent) for sent in sents]
+).fillna("")
+
+"""
tests/test_shuffle_sents.py
ADDED
@@ -0,0 +1,136 @@
+"""Test shuffle_sents.
+
+eps: float = 6
+min_samples: int = 4
+tf_type: str = "linear"
+idf_type: Optional[str] = None
+dl_type: Optional[str] = None
+norm: Optional[str] = None
+lang1: Optional[str] = "en"
+lang2: Optional[str] = "zh"
+"""
+from radiobee.seg_text import seg_text
+from radiobee.shuffle_sents import shuffle_sents
+from radiobee.align_sents import align_sents
+
+text1 = """`Wretched inmates!' I ejaculated mentally, `you deserve perpetual isolation from your species for your churlish inhospitality. At least, I would not keep my doors barred in the day time. I don't care--I will get in!' So resolved, I grasped the latch and shook it vehemently. Vinegar-faced Joseph projected his head from a round window of the barn."""
+text2 = """“被囚禁的囚犯!”我在精神上被射精,“你应该永远与你的物种隔绝,因为你这种粗鲁的病态。至少,我白天不会锁门,我不在乎,我进去了!”我决心如此,我抓住了门锁,狠狠地摇了一下。醋脸的约瑟夫从谷仓的圆窗朝他的头照射。"""
+text3 = """"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit. Zumindest würde ich meine Türen tagsüber nicht verriegeln. Das ist mir egal - ich werde reinkommen!' So entschlossen, ergriff ich die Klinke und rüttelte heftig daran. Der essiggesichtige Joseph streckte seinen Kopf aus einem runden Fenster der Scheune."""
+
+
+def test_shuffle_sents_en_zh():
+    """Test shuffle_sents_en_zh."""
+    sents_en = seg_text(text1)
+    sents_zh = seg_text(text2)
+
+    lang1 = "en"
+    lang2 = "zh"
+
+    pairs = shuffle_sents(sents_en, sents_zh)
+    pairs_ = shuffle_sents(sents_en, sents_zh, lang1=lang1, lang2=lang2)
+
+    # pairs[3] == ('', "I don't care--I will get in!'", '')
+    assert pairs == pairs_
+
+    # assert not pairs[3][0]
+    # after swapping
+    assert not pairs[3][1]
+
+
+def test_shuffle_sents_en_de():
+    """Test shuffle_sents_en_de."""
+    sents_en = seg_text(text1)
+    sents_de = seg_text(text3)
+
+    lang1 = "en"
+    lang2 = "de"
+
+    pairs = shuffle_sents(sents_en, sents_de)
+    pairs_ = shuffle_sents(sents_en, sents_de, lang1=lang1, lang2=lang2)
+
+    assert pairs == pairs_
+
+    #
+    # assert not pairs[3][0]
+    _ = """In [218]: pairs[:2]
+    Out[218]:
+    [["`Wretched inmates!'", '', ''],
+     ['I ejaculated mentally, `you deserve perpetual isolation from your species for your churlish inhospitality.',
+      '"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit.',
+      0.62]]
+    """
+    assert not pairs[0][1]
+    assert "mentally" in str(pairs[1]) and "Elende" in str(pairs[1])
+
+    # [elm[2] for elm in pairs]
+    # ['', 0.62, 0.72, 0.74, 0.68, 0.79]
+    if isinstance(pairs[1][2], float):
+        assert pairs[1][2] > 0.6
+    if isinstance(pairs[2][2], float):
+        assert pairs[2][2] > 0.7
+    if isinstance(pairs[3][2], float):
+        assert pairs[3][2] > 0.7
+    if isinstance(pairs[4][2], float):
+        assert pairs[4][2] > 0.6
+    if isinstance(pairs[5][2], float):
+        assert pairs[5][2] > 0.7
+
+
+_ = """
+In [232]: shuffle_sents.cmat.round(2)
+Out[232]:
+array([[ 0.27,  0.62,  0.07,  0.11,  0.02,  0.02],
+       [ 0.03,  0.09,  0.72,  0.18,  0.07, -0.07],
+       [ 0.19,  0.07,  0.16,  0.74, -0.01, -0.02],
+       [-0.02,  0.18,  0.16,  0.06,  0.68, -0.04],
+       [ 0.02,  0.07,  0.04, -0.04,  0.02,  0.79]], dtype=float32)
+pairs[1]
+sents_en[1], sents_de[0], shuffle_sents.cmat[0, 1]
+['I ejaculated mentally, `you deserve perpetual isolation from your species for your churlish inhospitality.',
+ '"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit.',
+ 0.62]
+
+pairs[2]
+sents_en[2], sents_de[1], shuffle_sents.cmat[1, 2].round(2)
+Out[244]:
+('At least, I would not keep my doors barred in the day time.',
+ 'Zumindest würde ich meine Türen tagsüber nicht verriegeln.',
+ 0.72)
+...
+
+import mtplotlib
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+sns.set()
+set_style("darkgrind")
+plt.ion()
+
+ali = shuffle_sents(sents_en, sents_de)
+sns.heatmap(shuffle_sents.cmat, cmap="viridis_r").invert_yaxis()
+ax = plt.gca()
+ax.set_xlabel(shuffle_sents.lang1)
+ax.set_ylabel(shuffle_sents.lang2)
+
+ali == [["`Wretched inmates!'", '', ''],
+ ['I ejaculated mentally, `you deserve perpetual isolation from your species for your churlish inhospitality.',
+  '"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit.',
+  0.62],
+ ['At least, I would not keep my doors barred in the day time.',
+  'Zumindest würde ich meine Türen tagsüber nicht verriegeln.',
+  0.72],
+ ["I don't care--I will get in!'",
+  "Das ist mir egal - ich werde reinkommen!'",
+  0.74],
+ ['So resolved, I grasped the latch and shook it vehemently.',
+  'So entschlossen, ergriff ich die Klinke und rüttelte heftig daran.',
+  0.68],
+ ['Vinegar-faced Joseph projected his head from a round window of the barn.',
+  'Der essiggesichtige Joseph streckte seinen Kopf aus einem runden Fenster der Scheune.',
+  0.79]]
+
+res1 = align_sents(sents_en, sents_de)
+ali = shuffle_sents(sents_en, sents_de)
+for idx in range(1, 6):
+    assert res1[idx] == tuple(ali[idx][:2])
+"""