Spaces · Build error
freemt committed · Commit 4c04f50
Parent(s): 02e4e96
Update before sent-align
- radiobee/__main__.py +2 -4
- radiobee/align_sents.py +4 -2
- radiobee/align_texts.py +2 -0
- radiobee/amend_avec.py +1 -1
- radiobee/app.py +2 -0
- radiobee/cmat2tset.py +2 -0
- radiobee/docterm_scores.py +7 -5
- radiobee/en2zh.py +6 -1
- radiobee/error_msg.py +2 -0
- radiobee/files2df.py +2 -0
- radiobee/gen_aset.py +2 -0
- radiobee/gen_eps_minsamples.py +7 -4
- radiobee/gen_model.py +5 -6
- radiobee/gen_pset.py +7 -2
- radiobee/gen_row_alignment.py +1 -1
- radiobee/gen_vector.py +2 -2
- radiobee/gradiobee.py +7 -9
- radiobee/interpolate_pset.py +1 -0
- radiobee/lists2cmat.py +22 -1
- radiobee/loadtext.py +13 -14
- radiobee/mdx_e2c.py +1 -0
- radiobee/model_s.py +37 -0
- radiobee/paras2sents.py +110 -0
- radiobee/plot_cmat.py +4 -3
- radiobee/plot_df.py +2 -1
- radiobee/process_upload.py +3 -2
- radiobee/seg_text.py +2 -0
- radiobee/shuffle_sents.py +44 -15
- radiobee/smatrix.py +8 -5
- radiobee/text2lists.py +2 -1
- requirements.txt +3 -1
- run-pydocstle.bat +1 -0
- run-pylint.bat +1 -0
- tests/test_align_sents.py +60 -2
- tests/test_lists2cmat_hlm.py +2 -2
- tests/test_paras2sents.py +34 -0
- tests/test_shuffle_sents.py +136 -0
radiobee/__main__.py
CHANGED
@@ -1,9 +1,9 @@
 """Run interactively."""
-# pylint: disable=invalid-name, too-many-arguments, unused-argument, redefined-builtin, wrong-import-position, too-many-locals, too-many-statements
+# pylint: disable=invalid-name, too-many-arguments, unused-argument, redefined-builtin, unused-import, wrong-import-position, too-many-locals, too-many-statements
 from typing import Any, Tuple, Optional, Union  # noqa

 import sys
-from pathlib import Path
+from pathlib import Path  # noqa
 import platform
 import signal
 from random import randint
@@ -377,8 +377,6 @@ if __name__ == "__main__":
     """
     ).strip()

-    # "<p style='text-align: center'><a href='https://arxiv.org/abs/2112.11641' target='_blank'>JoJoGAN: One Shot Face Stylization</a>| <a href='https://github.com/mchong6/JoJoGAN' target='_blank'>Github Repo Pytorch</a></p> <center><img src='https://visitor-badge.glitch.me/badge?page_id=akhaliq_jojogan' alt='visitor badge'></center> <p style='text-align: center'>samples from repo: <img src='https://raw.githubusercontent.com/mchong6/JoJoGAN/main/teaser.jpg' alt='animation'/></p>"  # noqa
-
     article = dedent(
         """ <p style="text-align: center">readiobee docs:
         <a href="https://radiobee.readthedocs.io/" target="_blank">readthedocs</a>
radiobee/align_sents.py
CHANGED
@@ -1,5 +1,5 @@
 """Align sents via gale-church."""
-# pylint: disable=
+# pylint: disable=invalid-name

 from typing import List, Tuple  # noqa

@@ -38,8 +38,10 @@ def align_sents(lst1: List[str], lst2: List[str]) -> List[Tuple[str, str]]:

     texts = []
     # for elm in aset:
-    for elm0, elm1 in amended_avec:
+    # for elm0, elm1 in amended_avec:
+    for elm in amended_avec:
         # elm0, elm1, elm2 = elm
+        elm0, elm1 = elm[:2]
         _ = []

         # src_text first
radiobee/align_texts.py
CHANGED
@@ -1,4 +1,6 @@
 """Align texts based on aset, src_text, tgt_text."""
+# pylint: disable=unused-variable
+
 from typing import List, Tuple, Union
 from logzero import logger
radiobee/amend_avec.py
CHANGED
@@ -1,5 +1,5 @@
 """Amend avec from align_block."""
-# pylint: disable=
+# pylint: disable=unused-variable, unused-import

 from typing import List, Tuple, Union
radiobee/app.py
CHANGED
@@ -1,4 +1,6 @@
 """Talk to spaces VM via subprocess.check_output."""
+# pylint: disable=unused-variable, invalid-name
+
 # import httpx
 import subprocess as sp
 from shlex import split
radiobee/cmat2tset.py
CHANGED
@@ -1,4 +1,6 @@
 """Gen triple-set from a matrix."""
+# pylint: disable=unused-import
+
 from typing import List, Tuple, Union  # noqa

 import numpy as np
radiobee/docterm_scores.py
CHANGED
@@ -2,9 +2,11 @@

 refer also to fast-scores fast_scores.py and gen_model.py (sklearn.feature_extraction.text.TfidfVectorizer).
 """
+# pylint: disable=too-many-arguments, too-many-locals, invalid-name, unused-import
+
 from typing import Dict, Iterable, List, Optional, Union  # noqa
-import numpy as np
 from itertools import chain
+import numpy as np
 from psutil import virtual_memory
 from more_itertools import ilen

@@ -48,8 +50,8 @@ def docterm_scores(
         for xelm in iter(doc1):
             for elm in iter(xelm):
                 assert isinstance(elm, str)
-    except AssertionError:
-        raise AssertionError(" doc1 is not of the typing Iterable[Iterable[str]] ")
+    except AssertionError as exc:
+        raise AssertionError(" doc1 is not of the typing Iterable[Iterable[str]] ") from exc
     except Exception as e:
         logger.error(e)
         raise
@@ -57,8 +59,8 @@ def docterm_scores(
         for xelm in iter(doc2):
             for elm in iter(xelm):
                 assert isinstance(elm, str)
-    except AssertionError:
-        raise AssertionError(" doc2 is not of the typing Iterable[Iterable[str]] ")
+    except AssertionError as exc:
+        raise AssertionError(" doc2 is not of the typing Iterable[Iterable[str]] ") from exc
     except Exception as e:
         logger.error(e)
         raise
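The `raise ... from exc` form adopted here (and in gen_model.py and smatrix.py below) chains the original AssertionError as `__cause__`, so the traceback shows both the failing assert and the descriptive message, and it satisfies pylint's raise-missing-from check. A minimal sketch of the pattern, with a hypothetical `validate` helper:

    def validate(doc):
        """Raise a descriptive error while keeping the original as __cause__."""
        try:
            for elm in iter(doc):
                assert isinstance(elm, str)
        except AssertionError as exc:
            # chained: traceback shows "The above exception was the direct cause of ..."
            raise AssertionError("doc is not of the typing Iterable[str]") from exc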
radiobee/en2zh.py
CHANGED
@@ -5,7 +5,9 @@ from typing import Iterable, List, Union
 import warnings

 import copy
-
+
+# from radiobee.mdx_e2c import mdx_e2c  # moved to local for lazy loading
+# from lazy import lazy

 warnings.simplefilter('ignore', DeprecationWarning)

@@ -25,6 +27,9 @@ def en2zh(
     Returns
         res: list of str
     """
+    # to effect lazy loading
+    from radiobee.mdx_e2c import mdx_e2c  # pylint: disable=import-outside-toplevel
+
     res = copy.deepcopy(text)
     if isinstance(text, str):
         # res = [text.split()]
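Moving the `mdx_e2c` import from module level into `en2zh` defers loading the lzma-compressed dictionary until the first call, so importing the package stays fast. A sketch of the same lazy-loading idea (file and names hypothetical):

    _cache = {}

    def lookup(word: str) -> str:
        """Load the heavy dictionary only on first use, then reuse it."""
        if "dict" not in _cache:
            import joblib  # deferred: pay the load cost on the first call only
            _cache["dict"] = joblib.load("big_dict.lzma")  # hypothetical file
        return _cache["dict"].get(word, word)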
radiobee/error_msg.py
CHANGED
@@ -1,4 +1,6 @@
 """Prepare an error message for gradiobee."""
+# pylint: disable=invalid-name
+
 from typing import Optional, Tuple, Union
 import pandas as pd
radiobee/files2df.py
CHANGED
@@ -1,4 +1,6 @@
 """Convert two iesl to pandas.DataFrame."""
+# pylint: disable=invalid-name
+
 from itertools import zip_longest
 # import tempfile
 import pandas as pd
radiobee/gen_aset.py
CHANGED
@@ -1,4 +1,6 @@
 """Genereat align set (aset) based on pset (pair set), src_lang and tgt_len."""
+# pylint: disable=unused-variable
+
 from typing import List, Tuple, Union
 from itertools import zip_longest
radiobee/gen_eps_minsamples.py
CHANGED
@@ -4,10 +4,13 @@
 def gen_eps_minsamples(src_len: int, tgt_len: int) -> dict:
     """Gen suggested eps min_samples."""
     eps = src_len * 0.01
-
-
+
+    # if eps < 3: eps = 3
+    eps = max(3, eps)

     min_samples = tgt_len / 100 * 0.5
-
-
+
+    # if min_samples < 3: min_samples = 3
+    min_samples = max(3, min_samples)
+
     return {"eps": eps, "min_samples": min_samples}
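The new `max(3, ...)` floors keep the suggested DBSCAN parameters sane for short texts: src_len=100 would otherwise yield eps=1.0, and tgt_len=120 would yield min_samples=0.6. A quick check of the function as shown above:

    def gen_eps_minsamples(src_len: int, tgt_len: int) -> dict:
        """Gen suggested eps min_samples, floored at 3."""
        eps = max(3, src_len * 0.01)
        min_samples = max(3, tgt_len / 100 * 0.5)
        return {"eps": eps, "min_samples": min_samples}

    assert gen_eps_minsamples(100, 120) == {"eps": 3, "min_samples": 3}
    assert gen_eps_minsamples(1000, 2000) == {"eps": 10.0, "min_samples": 10.0}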
radiobee/gen_model.py
CHANGED
@@ -8,6 +8,8 @@ doc_term_matrix

 tokenized_docs = [insert_spaces(elm).split() for elm in textzh]
 """
+# pylint: disable=too-many-arguments, invalid-name, unused-import
+
 from typing import Dict, Iterable, List, Optional, Union  # noqa

 from textacy.representations import Vectorizer
@@ -30,16 +32,13 @@ def gen_model(
     """Generate a model (textacy.representations.Vectorizer).

     Args:
-
-
+        tokenized_docs: tokenized docs
         (refer to textacy.representation.Vectorizer)
         tf_type: Type of term frequency (tf) to use for weights' local component:
-
         - "linear": tf (tfs are already linear, so left as-is)
         - "sqrt": tf => sqrt(tf)
         - "log": tf => log(tf) + 1
         - "binary": tf => 1
-
         idf_type: Type of inverse document frequency (idf) to use for weights'
             global component:

@@ -91,8 +90,8 @@ def gen_model(
         for xelm in iter(tokenized_docs):
             for elm in iter(xelm):
                 assert isinstance(elm, str)
-    except AssertionError:
-        raise AssertionError(" tokenized_docs is not of the typing Iterable[Iterable[str]] ")
+    except AssertionError as e:
+        raise AssertionError(" tokenized_docs is not of the typing Iterable[Iterable[str]] ") from e
     except Exception as e:
         logger.error(e)
         raise
radiobee/gen_pset.py
CHANGED
@@ -2,6 +2,8 @@

 tinybee.find_pairs.py with fixed estimator='dbscan' eps=eps, min_samples=min_samples
 """
+# pylint: disable=too-many-locals, unused-import, invalid-name
+
 from typing import List, Tuple, Union

 import numpy as np
@@ -22,6 +24,7 @@ def _gen_pset(
 # ) -> List[Tuple[int, int, Union[float, str]]]:
 ) -> List[Tuple[Union[float, str], Union[float, str], Union[float, str]]]:
     """Gen pset from cmat.
+
     Find pairs for a given cmat.

     Args:
@@ -86,8 +89,9 @@ def _gen_pset(
     # low_ = np.min(ymax) - 1  # reset to minimum_value - 1

     buff = [(-1, -1, ""), (tgt_len, src_len, "")]
-
-    for idx, tset_elm in enumerate(tset):
+
+    # for idx, tset_elm in enumerate(tset):
+    for tset_elm in tset:
         logger.debug("buff: %s", buff)
         # postion max in ymax and insert in buff
         # if with range given by iset+-delta and
@@ -152,6 +156,7 @@ def gen_pset(

     Refer to _gen_pset.
     """
+    del verbose
     gen_pset.min_samples = min_samples
     for min_s in range(min_samples):
         logger.debug(" min_samples, try %s", min_samples - min_s)
radiobee/gen_row_alignment.py
CHANGED
@@ -35,7 +35,7 @@ idx += 1; i0, i1, i2 = resu[idx]; '***' if i0 == ''
 else src_text[int(i0)], '***' if i1 == '' else tgt_text[int(i1)], ''
 if i2 == '' else i2
 """
-# pylint: disable=line-too-long
+# pylint: disable=line-too-long, unused-variable
 from typing import List, Union

 # natural extrapolation with slope equal to 1
radiobee/gen_vector.py
CHANGED
@@ -9,11 +9,11 @@ from radiobee.insert_spaces import insert_spaces


 def gen_vector(text: Union[str, List[str]], model: Vectorizer) -> List[float]:
-    """Gen vector for a give model.
+    r"""Gen vector for a give model.

     Args:
         text: string of Chinese chars or English words.
-
+        model: model used
     filename = r"data\test-dual.txt"
     text = loadtext(filename)
     list1, list2 = zip(*text2lists(text))
radiobee/gradiobee.py
CHANGED
@@ -1,5 +1,5 @@
 """Gradiobee."""
-# pylint: disable=invalid-name
+# pylint: disable=invalid-name, too-many-arguments, too-many-branches, too-many-locals, too-many-statements, unused-variable, too-many-return-statements, unused-import
 from pathlib import Path
 import platform
 import inspect
@@ -12,9 +12,9 @@ from fastlid import fastlid
 from logzero import logger
 from icecream import ic

-import numpy as np
+import numpy as np  # noqa
 import pandas as pd
-import matplotlib
+import matplotlib  # noqa
 import matplotlib.pyplot as plt
 import seaborn as sns

@@ -32,11 +32,8 @@ from radiobee.text2lists import text2lists

 uname = platform.uname()
 HFSPACES = False
-
-if True:
+if "amzn2" in uname.release:  # on hf spaces
     HFSPACES = True
-    from sentence_transformers import SentenceTransformer  # noqa
-    model_s = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v1')

 sns.set()
 sns.set_style("darkgrid")
@@ -191,8 +188,8 @@ def gradiobee(

     logger.debug("lang1: %s, lang2: %s", lang1, lang2)
     if debug:
-
-
+        ic(f" lang1: {lang1}, lang2: {lang2}")
+        ic("fast track? ", lang1 in lang_en_zh and lang2 in lang_en_zh)

     # fast track
     if lang1 in lang_en_zh and lang2 in lang_en_zh:
@@ -225,6 +222,7 @@ def gradiobee(
             )
             return error_msg(msg, "info ")
         try:
+            from radiobee.model_s import model_s  # pylint: disable=import-outside-toplevel
             vec1 = model_s.encode(list1)
             vec2 = model_s.encode(list2)
             # cmat = vec1.dot(vec2.T)
radiobee/interpolate_pset.py
CHANGED
@@ -1,4 +1,5 @@
 """Interpolate np.nan."""
+# pylint: disable=invalid-name
 from typing import List, Tuple
 import numpy as np
 import pandas as pd
radiobee/lists2cmat.py
CHANGED
@@ -1,5 +1,6 @@
 """Convert two lists of str (texts) to correlation matrix."""
-#
+# pylint: disable=too-many-arguments, too-many-locals, unused-import
+
 from typing import Dict, Iterable, List, Optional, Union  # noqa

 import numpy as np
@@ -32,6 +33,26 @@ def lists2cmat(
     vocabulary_terms: Optional[Union[Dict[str, int], Iterable[str]]] = None
 ) -> np.ndarray:
     # fmt: on
+    """Convert two lists to cmat.
+
+    Args:
+        text1: refer smatrix
+        text2: refer smatrix
+        lang1: optional 1st lang code
+        lang2: optional 2nd lang code
+        dl_type: doc lenth
+        idf_type: idf tyoe
+        max_df: max doc freq
+        max_n_terms: max n terms
+        min_df: min doc freq
+        model: optional model
+        norm: norm
+        tf_type: term freq type
+        vocabulary_terms: vocab refer smatrix
+
+    Returs
+        cmat
+    """
     if isinstance(text1, str):
         text1 = [text1]
     if isinstance(text2, str):
radiobee/loadtext.py
CHANGED
@@ -1,5 +1,4 @@
-"""
-Load file content to text.
+"""Load file content to text.

 Check encoding and load a file to text.

@@ -16,6 +15,8 @@ magic.from_file("testdata/test.pdf")
 original load_textrev
 refer to load_paras.py
 """
+# pylint: disable=line-too-long, unused-variable, unused-import
+
 from typing import Optional, Union  # noqa
 from pathlib import Path
 import cchardet
@@ -34,7 +35,7 @@ def loadtext(filepath: Union[Path, str] = "") -> str:
     if not filepath.is_file():
         logger.error(" file [%s] does not exist or is not a file.", filepath)
         # return None
-        raise Exception(" file [
+        raise Exception(" file [{filepath}] does not exist or is not a file.")

     # encoding = detect_file(filepath)
     encoding = cchardet.detect(filepath.read_bytes()).get("encoding", "utf8")
@@ -44,7 +45,7 @@ def loadtext(filepath: Union[Path, str] = "") -> str:

     # cchardet: 'GB18030', no need for errors="ignore"
     try:
-        text = filepath.read_text(encoding, errors="ignore")
+        text = filepath.read_text(encoding=encoding, errors="ignore")
     except Exception as exc:
         logger.error(" Opening %s resulted in errors: %s", filepath, exc)
         raise
@@ -53,8 +54,7 @@ def loadtext(filepath: Union[Path, str] = "") -> str:


 def test1():
-    r"""
-    Tests default file.
+    r"""Tests default file.

     defaultdir = r'D:\dl\Dropbox\mat-dir\snippets-mat\pyqt'
     defaultfile = r'notes pyqt tkinter tktable.txt'
@@ -69,10 +69,11 @@ def test1():


 def testgb():
-    r"""
-
-
-
+    r"""Tests shuangyu_ku\txt-books\19部世界名著中英文对照版TXT."""
+    file = (
+        r"C:\dl\Dropbox\shuangyu_ku\txt-books\19部世界名著中英文对照版TXT"
+        r"\爱丽丝漫游奇境记.txt"
+    )
     text = loadtext(file)
     if text:
         # assert len(text) == 190913
@@ -84,10 +85,8 @@ def testgb():
     assert text0 == text[:500]


-def
-    r"""
-    Test 'E:\\beta_final_version\\build\\test_files\\files_for_testing_import\\Folding_Beijing_12.txt'.
-    """
+def test_utf_16le():
+    r"""Test 'E:\\beta_final_version\\build\\test_files\\files_for_testing_import\\Folding_Beijing_12.txt'."""
     # file = 'E:\\beta_final_version\\build\\test_files\\files_for_testing_import\\Folding_Beijing_12.txt'  # NOQA
     file = r"C:\dl\Dropbox\mat-dir\snippets-mat\pyqt\Sandbox\hp_beta-version_files\test_files\files_for_testing_import\Folding_Beijing_12.txt"  # NOQA
     file = r"C:\dl\Dropbox\mat-dir\pyqt\Sandbox\hp_beta-version_files\test_files\files_for_testing_import\Folding_Beijing_12.txt"
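The read fix passes `encoding` by keyword to `Path.read_text(encoding=None, errors=None)`. A condensed sketch of the detect-then-read flow used by `loadtext` (file name hypothetical):

    from pathlib import Path
    import cchardet

    def read_any(filename: str) -> str:
        """Detect the encoding from raw bytes, then decode leniently."""
        fpath = Path(filename)
        detected = cchardet.detect(fpath.read_bytes()).get("encoding") or "utf8"
        return fpath.read_text(encoding=detected, errors="ignore")

    # text = read_any("data/test-dual.txt")  # handles GB18030, UTF-16LE, UTF-8, ...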
radiobee/mdx_e2c.py
CHANGED
@@ -3,6 +3,7 @@
 mdx_e2c = joblib.load("./mdx_dict_e2c.lzma")
 mdx_c2e = joblib.load("./mdx_dict_e2c.lzma")
 """
+# pylint: disable=invalid-name,
 from pathlib import Path
 from string import punctuation
 import joblib
radiobee/model_s.py
ADDED
@@ -0,0 +1,37 @@
+"""Load model_s."""
+# pylint: disable=invalid-name
+
+from pathlib import Path
+
+import joblib
+from huggingface_hub import hf_hub_url, cached_download  # hf_hub_download,
+from alive_progress import alive_bar
+from logzero import logger
+
+
+def load_model_s():
+    """Load local model_s if present, else fetch from hf.co."""
+    file_loc = "radiobee/model_s"
+    if Path(file_loc).exists():
+        # raise Exception(f"File {file_loc} does not exist.")
+
+        with alive_bar(1, title=" Loading model_s, takes ~30 secs ...", length=3) as progress_bar:
+            model = joblib.load(file_loc)
+
+            # model_s = pickle.load(open(file_loc, "rb"))
+            progress_bar()  # pylint: disable=not-callable
+
+        return model
+
+    logger.info(
+        "Fetching and caching model_s from huggingface.co... "
+        "The first time may take a while depending on your net."
+    )
+    with alive_bar(1, title=" Subsequent loading takes ~20 secs ...", length=3) as progress_bar:
+        model = joblib.load(cached_download(hf_hub_url("mikeee/model_s", "model_s")))
+        progress_bar()  # pylint: disable=not-callable
+
+    return model
+
+
+model_s = load_model_s()
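This new module caches a sentence-embedding model (the previous revision instantiated SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v1') inline in gradiobee.py) and loads it once at import. A usage sketch; downstream code treats the dot product of the encoded batches as a similarity matrix:

    from radiobee.model_s import model_s  # runs load_model_s() on first import

    sents_en = ["A cat sits on the mat.", "It is raining."]
    sents_zh = ["猫坐在垫子上。", "正在下雨。"]

    vec1 = model_s.encode(sents_en)  # (2, dim) array
    vec2 = model_s.encode(sents_zh)
    cmat = vec2.dot(vec1.T)          # shape (len(sents_zh), len(sents_en))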
radiobee/paras2sents.py
ADDED
@@ -0,0 +1,110 @@
+"""Convert paras to sents."""
+# pylint: disable=unused-import, too-many-branches, ungrouped-imports
+
+from typing import Callable, List, Optional, Tuple, Union
+
+from itertools import zip_longest
+import numpy as np
+import pandas as pd
+from logzero import logger
+
+from radiobee.align_sents import align_sents
+from radiobee.seg_text import seg_text
+from radiobee.detect import detect
+
+try:
+    from radiobee.shuffle_sents import shuffle_sents
+except Exception as exc:
+    logger.error("shuffle_sents not available: %s, using align_sents", exc)
+    shuffle_sents = lambda x1, x2, lang1="", lang2="": align_sents(x1, x2)  # noqa
+
+
+def paras2sents(
+    paras_: Union[pd.DataFrame, List[Tuple[str, str, Union[str, float]]], np.ndarray],
+    align_func: Optional[Union[Callable, str]] = None,
+    lang1: Optional[str] = None,
+    lang2: Optional[str] = None,
+) -> List[Tuple[str, str, Union[str, float]]]:
+    """Convert paras to sents using align_func.
+
+    Args:
+        paras_: list of 3-tuples or numpy or pd.DataFrame
+        lang1: fisrt lang code
+        lang2: second lang code
+        align_func: func used in the sent level
+            if set to None, default to align_sents
+    Returns:
+        list of sents (possible with likelihood for shuffle_sents)
+    """
+    # wrap everything in pd.DataFrame
+    # necessary to make pyright happy
+    paras = pd.DataFrame(paras_).fillna("")
+
+    # take the first three columns at maximum
+    paras = paras.iloc[:, :3]
+
+    if len(paras.columns) < 2:
+        logger.error(
+            "Need at least two columns, got %s",
+            len(paras.columns)
+        )
+        raise Exception("wrong data")
+
+    # append the third col (all "") if there are only two cols
+    if len(paras.columns) < 3:
+        paras.insert(2, "likelihood", [""] * len(paras))
+
+    if lang1 is None:
+        lang1 = detect(" ".join(paras.iloc[:, 0]))
+    if lang2 is None:
+        lang2 = detect(" ".join(paras.iloc[:, 1]))
+
+    left, right = [], []
+    row0, row1 = [], []
+    for elm0, elm1, elm2 in paras.values:
+        sents0 = seg_text(elm0, lang1)
+        sents1 = seg_text(elm1, lang2)
+        if isinstance(elm2, float) and elm2 > 0:
+            if row0 or row1:
+                left.append(row0)
+                right.append(row1)
+                row0, row1 = [], []  # collect and prepare
+
+            if sents0:
+                left.append(sents0)
+            if sents1:
+                right.append(sents1)
+        else:
+            if sents0:
+                row0.extend(sents0)
+            if sents1:
+                row1.extend(sents1)
+    # collect possible last batch
+    if row0 or row1:
+        left.append(row0)
+        right.append(row1)
+
+    # res = [*zip(left, right)]
+
+    # align each batch using align_func
+
+    # ready align_func
+    if align_func is None:
+        align_func = align_sents
+    if isinstance(align_func, str) and align_func.startswith("shuffle") or not isinstance(align_func, str) and align_func.__name__ in ["shuffle_sents"]:
+        align_func = lambda row0, row1: shuffle_sents(row0, row1, lang1=lang1, lang2=lang2)  # noqa
+    else:
+        align_func = align_sents
+
+    res = []
+    for row0, row1 in zip(left, right):
+        try:
+            _ = align_func(row0, row1)
+        except Exception as exc:
+            logger.error("errors: %s, resorting to zip_longest", exc)
+            _ = [*zip_longest(row0, row1, fillvalue="")]
+
+        # res.append(_)
+        res.extend(_)
+
+    return res
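A usage sketch for the new helper (inline data hypothetical; see tests/test_paras2sents.py below for the xlsx-driven version). Rows whose third column carries a positive float are treated as already-aligned paragraph boundaries; everything else is pooled and re-aligned at sentence level:

    import pandas as pd
    from radiobee.paras2sents import paras2sents

    paras = pd.DataFrame({
        "text1": ["Hello world. How are you?"],
        "text2": ["你好世界。你好吗？"],
        "likelihood": [0.9],
    })

    sents = paras2sents(paras)                     # default align_func: align_sents
    # sents = paras2sents(paras, "shuffle_sents")  # tf-idf / model_s based alignment
    for pair in sents:
        print(pair)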
radiobee/plot_cmat.py
CHANGED
@@ -1,5 +1,6 @@
 """Plot pandas.DataFrame with DBSCAN clustering."""
-# pylint: disable=invalid-name, too-many-arguments
+# pylint: disable=invalid-name, too-many-arguments, too-many-locals
+
 import numpy as np
 import pandas as pd
 import matplotlib
@@ -37,13 +38,13 @@ def plot_cmat(
     backend: str = "Agg",
     showfig: bool = False,
 ):
-    # ) -> plt:
     # fmt: on
     """Plot df with DBSCAN clustering.

     Args:
         df_: pandas.DataFrame, with three columns columns=["x", "y", "cos"]
-
+
+    Returns
         matplotlib.pyplot: for possible use in gradio

     plot_df(pd.DataFrame(cmat2tset(smat), columns=['x', 'y', 'cos']))
radiobee/plot_df.py
CHANGED
@@ -1,5 +1,5 @@
 """Plot pandas.DataFrame with DBSCAN clustering."""
-# pylint: disable=invalid-name, too-many-arguments
+# pylint: disable=invalid-name, too-many-arguments, unused-import
 import numpy as np  # noqa
 import pandas as pd
 import matplotlib
@@ -38,6 +38,7 @@ def plot_df(

     Args:
         df_: pandas.DataFrame, with three columns columns=["x", "y", "cos"]
+
     Returns:
         matplotlib.pyplot: for possible use in gradio

radiobee/process_upload.py
CHANGED
@@ -1,4 +1,5 @@
 """Process uploads."""
+# pylint: disable=invalid-name, unused-import
 from typing import Union

 from pathlib import Path
@@ -51,7 +52,7 @@ def process_upload(upload: Union[tempfile._TemporaryFileWrapper, bytes]) -> str:

     if encoding is not None:
         try:
-            text = fpath.read_text(encoding)
+            text = fpath.read_text(encoding=encoding)
         except Exception as e:
             logger.error("Unable to retrieve text, error: %s", e)
             text = str(e)
@@ -63,7 +64,7 @@ def process_upload(upload: Union[tempfile._TemporaryFileWrapper, bytes]) -> str:
     # not able to cchardet: encoding is None, docx, pdf, epub, zip etc
     logger.info("Trying docx...to be implemented")

-    #
+    # T ODO .docx .epub .mobi .pdf etc.

     _ = Path(upload.name)
     msg = f"binary file: {_.stem[:-8]}{_.suffix}"
radiobee/seg_text.py
CHANGED
@@ -97,6 +97,8 @@ def seg_text(

     Arguments:
         lst: text or text list
+        lang: optional lang code
+        maxlines: (default 1000), threshold for turn on tqdm progressbar, set to <1 or a large number to turn it off
         extra: re.split(rf"{extra}, text) first
     Returns:
         list of splitted text.
radiobee/shuffle_sents.py
CHANGED
@@ -1,8 +1,9 @@
 """Shuffle sents."""
-# pylint: disable=
+# pylint: disable=unused-import, too-many-arguments, too-many-locals,

 from typing import List, Optional, Tuple, Union

+import pandas as pd
 from fastlid import fastlid
 from logzero import logger  # noqa

@@ -26,12 +27,23 @@ def shuffle_sents(
     lang2: Optional[str] = None,
 ) -> List[Tuple[str, str, Union[str, float]]]:
     # fmt: on
-    """
+    """Shuffle sents to the right positions.

     Based on __main__.py.
+
+    eps: float = 6
+    min_samples: int = 4
+    tf_type: str = "linear"
+    idf_type: Optional[str] = None
+    dl_type: Optional[str] = None
+    norm: Optional[str] = None
+    lang1: Optional[str] = "en"
+    lang2: Optional[str] = "zh"
     """
     set_languages = fastlid.set_languages
-    fastlid.set_languages = ["en", "zh"]
+    # fastlid.set_languages = ["en", "zh"]
+    fastlid.set_languages = None
+
     if lang1 is None:
         lang1, _ = fastlid(" ".join(lst1))
     if lang2 is None:
@@ -40,16 +52,28 @@ def shuffle_sents(
     # restore fastlid.set_languages
     fastlid.set_languages = set_languages

-
-
-
-
-
-
-
-
-
-
+    lang_dicts = ["en", "zh"]
+    if lang1 in lang_dicts and lang2 in lang_dicts:
+        cmat = lists2cmat(
+            lst1,
+            lst2,
+            tf_type=tf_type,
+            idf_type=idf_type,
+            dl_type=dl_type,
+            norm=norm,
+            lang1=lang1,
+            lang2=lang2,
+        )
+    else:  # use model_s
+        from radiobee.model_s import model_s  # pylint: disable=import-outside-toplevel
+        vec1 = model_s.encode(lst1)
+        vec2 = model_s.encode(lst2)
+        # cmat = vec1.dot(vec2.T)
+        cmat = vec2.dot(vec1.T)
+
+    shuffle_sents.cmat = cmat
+    shuffle_sents.lang1 = lang1
+    shuffle_sents.lang2 = lang2

     pset = gen_pset(
         cmat,
@@ -63,6 +87,11 @@ def shuffle_sents(

     final_list = align_texts(aset, lst2, lst1)

-    return final_list
-
+    # return final_list
+
+    # swap columns 0, 1
+    _ = pd.DataFrame(final_list)
+
+    _ = _.iloc[:, [1, 0] + [*range(2, _.shape[1])]]
+
+    return _.to_numpy().tolist()
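Note the orientation: both branches produce a cmat of shape (len(lst2), len(lst1)) — lists2cmat behaves the same way (cf. the (55, 135)/(135, 55) shapes asserted in tests/test_lists2cmat_hlm.py below) — and the trailing DataFrame slice swaps columns 0 and 1 so lst1 entries come first in the output. A shape-check sketch with random stand-ins for the model_s embeddings:

    import numpy as np

    lst1 = ["a", "b", "c"]  # 3 source sents
    lst2 = ["x", "y"]       # 2 target sents

    rng = np.random.default_rng(0)
    vec1 = rng.normal(size=(len(lst1), 512))  # stand-in for model_s.encode(lst1)
    vec2 = rng.normal(size=(len(lst2), 512))

    cmat = vec2.dot(vec1.T)
    assert cmat.shape == (len(lst2), len(lst1))  # rows: lst2, cols: lst1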
radiobee/smatrix.py
CHANGED
@@ -3,13 +3,16 @@
 refer also to fast-scores fast_scores.py and gen_model.py (sklearn.feature_extraction.text.TfidfVectorizer).
 originally docterm_scores.py.
 """
+# pylint: disable=invalid-name, too-many-locals, too-many-arguments
+
 from typing import Dict, Iterable, Optional, Union
-import numpy as np
 from itertools import chain
+import numpy as np
 from psutil import virtual_memory
 from more_itertools import ilen

 from textacy.representations import Vectorizer
+
 # from textacy.representations.vectorizers import Vectorizer
 from logzero import logger

@@ -51,8 +54,8 @@ def smatrix(
         for xelm in iter(doc1):
            for elm in iter(xelm):
                 assert isinstance(elm, str)
-    except AssertionError:
-        raise AssertionError(" doc1 is not of the typing Iterable[Iterable[str]] ")
+    except AssertionError as exc:
+        raise AssertionError(" doc1 is not of the typing Iterable[Iterable[str]] ") from exc
     except Exception as e:
         logger.error(e)
         raise
@@ -60,8 +63,8 @@ def smatrix(
         for xelm in iter(doc2):
             for elm in iter(xelm):
                 assert isinstance(elm, str)
-    except AssertionError:
-        raise AssertionError(" doc2 is not of the typing Iterable[Iterable[str]] ")
+    except AssertionError as exc:
+        raise AssertionError(" doc2 is not of the typing Iterable[Iterable[str]] ") from exc
     except Exception as e:
         logger.error(e)
         raise
radiobee/text2lists.py
CHANGED
@@ -1,5 +1,6 @@
 """Separate text to zh en lists."""
-# pylint: disable=
+# pylint: disable=unused-import, too-many-locals, invalid-name, too-many-branches, too-many-statements,
+
 # from typing import Tuple,
 from typing import Iterable, List, Optional, Tuple, Union  # noqa
requirements.txt
CHANGED
@@ -24,4 +24,6 @@ pycld2
 tqdm
 polyglot
 sentence_splitter
-icecream
+icecream
+# lazy
+alive-progress
run-pydocstle.bat
ADDED
@@ -0,0 +1 @@
+pydocstyle --convention=google radiobee tests
run-pylint.bat
ADDED
@@ -0,0 +1 @@
+pylint radiobee -d duplicate-code
tests/test_align_sents.py
CHANGED
@@ -1,9 +1,14 @@
 """Test align_sents."""
 from radiobee.align_sents import align_sents
+from radiobee.seg_text import seg_text

+text1 = """`Wretched inmates!' I ejaculated mentally, `you deserve perpetual isolation from your species for your churlish inhospitality. At least, I would not keep my doors barred in the day time. I don't care--I will get in!' So resolved, I grasped the latch and shook it vehemently. Vinegar-faced Joseph projected his head from a round window of the barn."""
+text2 = """“被囚禁的囚犯!”我在精神上被射精,“你应该永远与你的物种隔绝,因为你这种粗鲁的病态。至少,我白天不会锁门,我不在乎,我进去了!”我决心如此,我抓住了门锁,狠狠地摇了一下。醋脸的约瑟夫从谷仓的圆窗朝他的头照射。"""
+text3 = """"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit. Zumindest würde ich meine Türen tagsüber nicht verriegeln. Das ist mir egal - ich werde reinkommen!' So entschlossen, ergriff ich die Klinke und rüttelte heftig daran. Der essiggesichtige Joseph streckte seinen Kopf aus einem runden Fenster der Scheune."""

-
-def test_align_sents():
+
+def test_align_sents_sanity():
+    """Test align_sents sanity check."""
     lst1, lst2 = [
         "a",
         "bs",
@@ -11,3 +16,56 @@ def test_align_sents():
     res = align_sents(lst1, lst2)

     assert res == [("a", "aaa"), ("a", "34"), ("bs", "a"), ("bs", "b")]
+
+
+def test_align_sents_en_zh():
+    """Test align_sents en-zh."""
+    sents_en = seg_text(text1)
+    sents_zh = seg_text(text2)
+
+    # 9ms vs shuffle_sents 50ms shuffle_sents wth lang1lang2 40ms
+    res = align_sents(sents_en, sents_zh)
+
+    _ = """res[2:4]
+    Out[26]:
+    [('At least, I would not keep my doors barred in the day time.',
+      '至少,我白天不会锁门,我不在乎,我进去了!”'),
+     ("I don't care--I will get in!'", '至少,我白天不会锁门,我不在乎,我进去了!”')]
+    """
+    assert "至少" in str(res[2])
+    assert "至少" in str(res[3])
+
+
+def test_align_sents_en_de():
+    """Test align_sents en-zh."""
+    sents_en = seg_text(text1)
+    sents_de = seg_text(text3)
+
+    res1 = align_sents(sents_en, sents_de)
+    _ = """In [48]: res1[:2]
+    Out[48]:
+    [("`Wretched inmates!'",
+      '"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit.'),
+     ('I ejaculated mentally, `you deserve perpetual isolation from your species for your churlish inhospitality.',
+      '"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit.')]
+    """
+    assert "Elende" in str(res1[0])
+    assert "Elende" in str(res1[1])
+
+
+    _ = """
+    [("`Wretched inmates!'",
+      '"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit.'),
+     ('I ejaculated mentally, `you deserve perpetual isolation from your species for your churlish inhospitality.',
+      '"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit.'),
+     ('At least, I would not keep my doors barred in the day time.',
+      'Zumindest würde ich meine Türen tagsüber nicht verriegeln.'),
+     ("I don't care--I will get in!'",
+      "Das ist mir egal - ich werde reinkommen!'"),
+     ('So resolved, I grasped the latch and shook it vehemently.',
+      'So entschlossen, ergriff ich die Klinke und rüttelte heftig daran.'),
+     ('Vinegar-faced Joseph projected his head from a round window of the barn.',
+      'Der essiggesichtige Joseph streckte seinen Kopf aus einem runden Fenster der Scheune.')]
+
+
+    """
tests/test_lists2cmat_hlm.py
CHANGED
@@ -37,9 +37,9 @@ def test_lists2cmat_hlm():
     # cmat = texts2cmat(lst1, lst2, lang1, lang2)
     cmat = lists2cmat(lst1, lst2, lang1, lang2)

-    assert cmat.shape == (
+    assert cmat.shape == (55, 135)

     cmat21 = lists2cmat(lst2, lst1, lang2, lang1)

-    assert cmat21.shape == (
+    assert cmat21.shape == (135, 55)
     assert lists2cmat(lst2, lst1).mean() > 0.05  # 0.09
tests/test_paras2sents.py
ADDED
@@ -0,0 +1,34 @@
+"""Test paras2sents."""
+# pylint: disable=invalid-name
+
+import pandas as pd
+from radiobee.paras2sents import paras2sents
+from radiobee.shuffle_sents import shuffle_sents
+
+file_loc = r"data/test-dual-zh-en.xlsx"
+paras = pd.read_excel(file_loc, header=0)
+paras = paras[["text1", "text2", "likelihood"]].fillna("")
+
+
+def test_paras2sents_dual():
+    """Test paras2sents_dual."""
+    sents = paras2sents(paras)
+
+    assert len(sents) > 202  # 208
+    # assert not sents
+
+
+def test_paras2sents_dual_model_s():
+    """Test paras2sents_dual_model_s."""
+    sents = paras2sents(paras, shuffle_sents)
+
+    assert len(sents) > 201  # 207
+    # assert not sents
+
+
+_ = """
+df = pd.DataFrame(
+    [list(sent) + [""] if len(sent) == 2 else list(sent) for sent in sents]
+).fillna("")
+
+"""
tests/test_shuffle_sents.py
ADDED
@@ -0,0 +1,136 @@
+"""Test shuffle_sents.
+
+eps: float = 6
+min_samples: int = 4
+tf_type: str = "linear"
+idf_type: Optional[str] = None
+dl_type: Optional[str] = None
+norm: Optional[str] = None
+lang1: Optional[str] = "en"
+lang2: Optional[str] = "zh"
+"""
+from radiobee.seg_text import seg_text
+from radiobee.shuffle_sents import shuffle_sents
+from radiobee.align_sents import align_sents
+
+text1 = """`Wretched inmates!' I ejaculated mentally, `you deserve perpetual isolation from your species for your churlish inhospitality. At least, I would not keep my doors barred in the day time. I don't care--I will get in!' So resolved, I grasped the latch and shook it vehemently. Vinegar-faced Joseph projected his head from a round window of the barn."""
+text2 = """“被囚禁的囚犯!”我在精神上被射精,“你应该永远与你的物种隔绝,因为你这种粗鲁的病态。至少,我白天不会锁门,我不在乎,我进去了!”我决心如此,我抓住了门锁,狠狠地摇了一下。醋脸的约瑟夫从谷仓的圆窗朝他的头照射。"""
+text3 = """"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit. Zumindest würde ich meine Türen tagsüber nicht verriegeln. Das ist mir egal - ich werde reinkommen!' So entschlossen, ergriff ich die Klinke und rüttelte heftig daran. Der essiggesichtige Joseph streckte seinen Kopf aus einem runden Fenster der Scheune."""
+
+
+def test_shuffle_sents_en_zh():
+    """Test shuffle_sents_en_zh."""
+    sents_en = seg_text(text1)
+    sents_zh = seg_text(text2)
+
+    lang1 = "en"
+    lang2 = "zh"
+
+    pairs = shuffle_sents(sents_en, sents_zh)
+    pairs_ = shuffle_sents(sents_en, sents_zh, lang1=lang1, lang2=lang2)
+
+    # pairs[3] == ('', "I don't care--I will get in!'", '')
+    assert pairs == pairs_
+
+    # assert not pairs[3][0]
+    # after swapping
+    assert not pairs[3][1]
+
+
+def test_shuffle_sents_en_de():
+    """Test shuffle_sents_en_de."""
+    sents_en = seg_text(text1)
+    sents_de = seg_text(text3)
+
+    lang1 = "en"
+    lang2 = "de"
+
+    pairs = shuffle_sents(sents_en, sents_de)
+    pairs_ = shuffle_sents(sents_en, sents_de, lang1=lang1, lang2=lang2)
+
+    assert pairs == pairs_
+
+    #
+    # assert not pairs[3][0]
+    _ = """In [218]: pairs[:2]
+    Out[218]:
+    [["`Wretched inmates!'", '', ''],
+     ['I ejaculated mentally, `you deserve perpetual isolation from your species for your churlish inhospitality.',
+      '"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit.',
+      0.62]]
+    """
+    assert not pairs[0][1]
+    assert "mentally" in str(pairs[1]) and "Elende" in str(pairs[1])
+
+    # [elm[2] for elm in pairs]
+    # ['', 0.62, 0.72, 0.74, 0.68, 0.79]
+    if isinstance(pairs[1][2], float):
+        assert pairs[1][2] > 0.6
+    if isinstance(pairs[2][2], float):
+        assert pairs[2][2] > 0.7
+    if isinstance(pairs[3][2], float):
+        assert pairs[3][2] > 0.7
+    if isinstance(pairs[4][2], float):
+        assert pairs[4][2] > 0.6
+    if isinstance(pairs[5][2], float):
+        assert pairs[5][2] > 0.7
+
+
+_ = """
+In [232]: shuffle_sents.cmat.round(2)
+Out[232]:
+array([[ 0.27,  0.62,  0.07,  0.11,  0.02,  0.02],
+       [ 0.03,  0.09,  0.72,  0.18,  0.07, -0.07],
+       [ 0.19,  0.07,  0.16,  0.74, -0.01, -0.02],
+       [-0.02,  0.18,  0.16,  0.06,  0.68, -0.04],
+       [ 0.02,  0.07,  0.04, -0.04,  0.02,  0.79]], dtype=float32)
+pairs[1]
+sents_en[1], sents_de[0], shuffle_sents.cmat[0, 1]
+['I ejaculated mentally, `you deserve perpetual isolation from your species for your churlish inhospitality.',
+ '"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit.',
+ 0.62]
+
+pairs[2]
+sents_en[2], sents_de[1], shuffle_sents.cmat[1, 2].round(2)
+Out[244]:
+('At least, I would not keep my doors barred in the day time.',
+ 'Zumindest würde ich meine Türen tagsüber nicht verriegeln.',
+ 0.72)
+...
+
+import mtplotlib
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+sns.set()
+set_style("darkgrind")
+plt.ion()
+
+ali = shuffle_sents(sents_en, sents_de)
+sns.heatmap(shuffle_sents.cmat, cmap="viridis_r").invert_yaxis()
+ax = plt.gca()
+ax.set_xlabel(shuffle_sents.lang1)
+ax.set_ylabel(shuffle_sents.lang2)
+
+ali == [["`Wretched inmates!'", '', ''],
+ ['I ejaculated mentally, `you deserve perpetual isolation from your species for your churlish inhospitality.',
+  '"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit.',
+  0.62],
+ ['At least, I would not keep my doors barred in the day time.',
+  'Zumindest würde ich meine Türen tagsüber nicht verriegeln.',
+  0.72],
+ ["I don't care--I will get in!'",
+  "Das ist mir egal - ich werde reinkommen!'",
+  0.74],
+ ['So resolved, I grasped the latch and shook it vehemently.',
+  'So entschlossen, ergriff ich die Klinke und rüttelte heftig daran.',
+  0.68],
+ ['Vinegar-faced Joseph projected his head from a round window of the barn.',
+  'Der essiggesichtige Joseph streckte seinen Kopf aus einem runden Fenster der Scheune.',
+  0.79]]
+
+res1 = align_sents(sents_en, sents_de)
+ali = shuffle_sents(sents_en, sents_de)
+for idx in range(1, 6):
+    assert res1[idx] == tuple(ali[idx][:2])
+"""