freemt committed
Commit 4c04f50
1 Parent(s): 02e4e96

Update before sent-align

radiobee/__main__.py CHANGED
@@ -1,9 +1,9 @@
 """Run interactively."""
-# pylint: disable=invalid-name, too-many-arguments, unused-argument, redefined-builtin, wrong-import-position, too-many-locals, too-many-statements
+# pylint: disable=invalid-name, too-many-arguments, unused-argument, redefined-builtin, unused-import, wrong-import-position, too-many-locals, too-many-statements
 from typing import Any, Tuple, Optional, Union  # noqa

 import sys
-from pathlib import Path
+from pathlib import Path  # noqa
 import platform
 import signal
 from random import randint
@@ -377,8 +377,6 @@ if __name__ == "__main__":
     """
     ).strip()

-    # "<p style='text-align: center'><a href='https://arxiv.org/abs/2112.11641' target='_blank'>JoJoGAN: One Shot Face Stylization</a>| <a href='https://github.com/mchong6/JoJoGAN' target='_blank'>Github Repo Pytorch</a></p> <center><img src='https://visitor-badge.glitch.me/badge?page_id=akhaliq_jojogan' alt='visitor badge'></center> <p style='text-align: center'>samples from repo: <img src='https://raw.githubusercontent.com/mchong6/JoJoGAN/main/teaser.jpg' alt='animation'/></p>"  # noqa
-
     article = dedent(
         """ <p style="text-align: center">readiobee docs:
         <a href="https://radiobee.readthedocs.io/" target="_blank">readthedocs</a>
radiobee/align_sents.py CHANGED
@@ -1,5 +1,5 @@
 """Align sents via gale-church."""
-# pylint: disable=
+# pylint: disable=invalid-name

 from typing import List, Tuple  # noqa

@@ -38,8 +38,10 @@ def align_sents(lst1: List[str], lst2: List[str]) -> List[Tuple[str, str]]:

     texts = []
     # for elm in aset:
-    for elm0, elm1 in amended_avec:
+    # for elm0, elm1 in amended_avec:
+    for elm in amended_avec:
         # elm0, elm1, elm2 = elm
+        elm0, elm1 = elm[:2]
         _ = []

         # src_text first
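Note on the loop rewrite above: slicing-based unpacking accepts rows of length two or more, where the old strict 2-tuple unpacking would raise ValueError once amended_avec rows carry a third (metadata) element. A minimal sketch with hypothetical rows:

    rows = [(0, 0), (1, 2, 0.97)]  # hypothetical amended_avec rows
    for row in rows:
        elm0, elm1 = row[:2]  # tolerant of an optional trailing element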
radiobee/align_texts.py CHANGED
@@ -1,4 +1,6 @@
 """Align texts based on aset, src_text, tgt_text."""
+# pylint: disable=unused-variable
+
 from typing import List, Tuple, Union
 from logzero import logger

radiobee/amend_avec.py CHANGED
@@ -1,5 +1,5 @@
 """Amend avec from align_block."""
-# pylint: disable=
+# pylint: disable=unused-variable, unused-import

 from typing import List, Tuple, Union

radiobee/app.py CHANGED
@@ -1,4 +1,6 @@
 """Talk to spaces VM via subprocess.check_output."""
+# pylint: disable=unused-variable, invalid-name
+
 # import httpx
 import subprocess as sp
 from shlex import split
radiobee/cmat2tset.py CHANGED
@@ -1,4 +1,6 @@
 """Gen triple-set from a matrix."""
+# pylint: disable=unused-import
+
 from typing import List, Tuple, Union  # noqa

 import numpy as np
radiobee/docterm_scores.py CHANGED
@@ -2,9 +2,11 @@

 refer also to fast-scores fast_scores.py and gen_model.py (sklearn.feature_extraction.text.TfidfVectorizer).
 """
+# pylint: disable=too-many-arguments, too-many-locals, invalid-name, unused-import
+
 from typing import Dict, Iterable, List, Optional, Union  # noqa
-import numpy as np
 from itertools import chain
+import numpy as np
 from psutil import virtual_memory
 from more_itertools import ilen

@@ -48,8 +50,8 @@ def docterm_scores(
     for xelm in iter(doc1):
         for elm in iter(xelm):
             assert isinstance(elm, str)
-    except AssertionError:
-        raise AssertionError(" doc1 is not of the typing Iterable[Iterable[str]] ")
+    except AssertionError as exc:
+        raise AssertionError(" doc1 is not of the typing Iterable[Iterable[str]] ") from exc
     except Exception as e:
         logger.error(e)
         raise
@@ -57,8 +59,8 @@ def docterm_scores(
     for xelm in iter(doc2):
         for elm in iter(xelm):
             assert isinstance(elm, str)
-    except AssertionError:
-        raise AssertionError(" doc2 is not of the typing Iterable[Iterable[str]] ")
+    except AssertionError as exc:
+        raise AssertionError(" doc2 is not of the typing Iterable[Iterable[str]] ") from exc
     except Exception as e:
         logger.error(e)
         raise
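Note on the except-clauses above: `raise ... from exc` chains the friendly AssertionError to the original one (PEP 3134), so the traceback still points at the failing element check. A minimal self-contained sketch of the idiom:

    def check(doc):
        # re-raise with a readable message while keeping the cause attached
        try:
            for elm in doc:
                assert isinstance(elm, str)
        except AssertionError as exc:
            raise AssertionError("doc is not of the typing Iterable[str]") from exc

    try:
        check(["ok", 42])
    except AssertionError as err:
        print(repr(err.__cause__))  # the original bare AssertionError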
radiobee/en2zh.py CHANGED
@@ -5,7 +5,9 @@ from typing import Iterable, List, Union
 import warnings

 import copy
-from radiobee.mdx_e2c import mdx_e2c
+
+# from radiobee.mdx_e2c import mdx_e2c  # moved to local for lazy loading
+# from lazy import lazy

 warnings.simplefilter('ignore', DeprecationWarning)

@@ -25,6 +27,9 @@ def en2zh(
     Returns
         res: list of str
     """
+    # to effect lazy loading
+    from radiobee.mdx_e2c import mdx_e2c  # pylint: disable=import-outside-toplevel
+
     res = copy.deepcopy(text)
     if isinstance(text, str):
         # res = [text.split()]
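Note on the hunk above: importing radiobee.mdx_e2c inside en2zh defers loading the lzma dictionary until the first call; later calls hit the sys.modules cache and pay only a lookup. A minimal sketch of the pattern (whether mdx_e2c is a callable or a mapping is not shown in this hunk; the sketch assumes a callable):

    def en2zh_word(word: str) -> str:
        # heavy import deferred to first call, cached in sys.modules afterwards
        from radiobee.mdx_e2c import mdx_e2c  # pylint: disable=import-outside-toplevel
        return mdx_e2c(word)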
radiobee/error_msg.py CHANGED
@@ -1,4 +1,6 @@
 """Prepare an error message for gradiobee."""
+# pylint: disable=invalid-name
+
 from typing import Optional, Tuple, Union
 import pandas as pd

radiobee/files2df.py CHANGED
@@ -1,4 +1,6 @@
 """Convert two iesl to pandas.DataFrame."""
+# pylint: disable=invalid-name
+
 from itertools import zip_longest
 # import tempfile
 import pandas as pd
radiobee/gen_aset.py CHANGED
@@ -1,4 +1,6 @@
 """Genereat align set (aset) based on pset (pair set), src_lang and tgt_len."""
+# pylint: disable=unused-variable
+
 from typing import List, Tuple, Union
 from itertools import zip_longest

radiobee/gen_eps_minsamples.py CHANGED
@@ -4,10 +4,13 @@
 def gen_eps_minsamples(src_len: int, tgt_len: int) -> dict:
     """Gen suggested eps min_samples."""
     eps = src_len * 0.01
-    if eps < 3:
-        eps = 3
+
+    # if eps < 3: eps = 3
+    eps = max(3, eps)

     min_samples = tgt_len / 100 * 0.5
-    if min_samples < 3:
-        min_samples = 3
+
+    # if min_samples < 3: min_samples = 3
+    min_samples = max(3, min_samples)
+
     return {"eps": eps, "min_samples": min_samples}
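Since the function is pure arithmetic, a worked call pins down the max() clamping (the pair presumably feeds DBSCAN, per gen_pset's fixed estimator):

    params = gen_eps_minsamples(500, 400)
    # eps = max(3, 500 * 0.01) = 5.0
    # min_samples = max(3, 400 / 100 * 0.5) = 3
    assert params == {"eps": 5.0, "min_samples": 3}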
radiobee/gen_model.py CHANGED
@@ -8,6 +8,8 @@ doc_term_matrix

 tokenized_docs = [insert_spaces(elm).split() for elm in textzh]
 """
+# pylint: disable=too-many-arguments, invalid-name, unused-import
+
 from typing import Dict, Iterable, List, Optional, Union  # noqa

 from textacy.representations import Vectorizer
@@ -30,16 +32,13 @@ def gen_model(
     """Generate a model (textacy.representations.Vectorizer).

     Args:
-        doc: tokenized docs
-
+        tokenized_docs: tokenized docs
         (refer to textacy.representation.Vectorizer)
         tf_type: Type of term frequency (tf) to use for weights' local component:
-
         - "linear": tf (tfs are already linear, so left as-is)
         - "sqrt": tf => sqrt(tf)
         - "log": tf => log(tf) + 1
         - "binary": tf => 1
-
         idf_type: Type of inverse document frequency (idf) to use for weights'
         global component:

@@ -91,8 +90,8 @@ def gen_model(
     for xelm in iter(tokenized_docs):
         for elm in iter(xelm):
             assert isinstance(elm, str)
-    except AssertionError:
-        raise AssertionError(" tokenized_docs is not of the typing Iterable[Iterable[str]] ")
+    except AssertionError as e:
+        raise AssertionError(" tokenized_docs is not of the typing Iterable[Iterable[str]] ") from e
     except Exception as e:
         logger.error(e)
         raise
radiobee/gen_pset.py CHANGED
@@ -2,6 +2,8 @@

 tinybee.find_pairs.py with fixed estimator='dbscan' eps=eps, min_samples=min_samples
 """
+# pylint: disable=too-many-locals, unused-import, invalid-name
+
 from typing import List, Tuple, Union

 import numpy as np
@@ -22,6 +24,7 @@ def _gen_pset(
 # ) -> List[Tuple[int, int, Union[float, str]]]:
 ) -> List[Tuple[Union[float, str], Union[float, str], Union[float, str]]]:
     """Gen pset from cmat.
+
     Find pairs for a given cmat.

     Args:
@@ -86,8 +89,9 @@ def _gen_pset(
     # low_ = np.min(ymax) - 1  # reset to minimum_value - 1

     buff = [(-1, -1, ""), (tgt_len, src_len, "")]
-    # for _ in range(tgt_len):
-    for idx, tset_elm in enumerate(tset):
+
+    # for idx, tset_elm in enumerate(tset):
+    for tset_elm in tset:
         logger.debug("buff: %s", buff)
         # postion max in ymax and insert in buff
         # if with range given by iset+-delta and
@@ -152,6 +156,7 @@ def gen_pset(

     Refer to _gen_pset.
     """
+    del verbose
     gen_pset.min_samples = min_samples
     for min_s in range(min_samples):
         logger.debug(" min_samples, try %s", min_samples - min_s)
radiobee/gen_row_alignment.py CHANGED
@@ -35,7 +35,7 @@ idx += 1; i0, i1, i2 = resu[idx]; '***' if i0 == ''
 else src_text[int(i0)], '***' if i1 == '' else tgt_text[int(i1)], ''
 if i2 == '' else i2
 """
-# pylint: disable=line-too-long
+# pylint: disable=line-too-long, unused-variable
 from typing import List, Union

 # natural extrapolation with slope equal to 1
radiobee/gen_vector.py CHANGED
@@ -9,11 +9,11 @@ from radiobee.insert_spaces import insert_spaces


 def gen_vector(text: Union[str, List[str]], model: Vectorizer) -> List[float]:
-    """Gen vector for a give model.
+    r"""Gen vector for a given model.

     Args:
         text: string of Chinese chars or English words.
-
+        model: model used
         filename = r"data\test-dual.txt"
         text = loadtext(filename)
         list1, list2 = zip(*text2lists(text))
radiobee/gradiobee.py CHANGED
@@ -1,5 +1,5 @@
 """Gradiobee."""
-# pylint: disable=invalid-name
+# pylint: disable=invalid-name, too-many-arguments, too-many-branches, too-many-locals, too-many-statements, unused-variable, too-many-return-statements, unused-import
 from pathlib import Path
 import platform
 import inspect
@@ -12,9 +12,9 @@ from fastlid import fastlid
 from logzero import logger
 from icecream import ic

-import numpy as np
+import numpy as np  # noqa
 import pandas as pd
-import matplotlib
+import matplotlib  # noqa
 import matplotlib.pyplot as plt
 import seaborn as sns

@@ -32,11 +32,8 @@ from radiobee.text2lists import text2lists

 uname = platform.uname()
 HFSPACES = False
-# if "amzn2" in uname.release:  # on hf spaces
-if True:
+if "amzn2" in uname.release:  # on hf spaces
     HFSPACES = True
-    from sentence_transformers import SentenceTransformer  # noqa
-    model_s = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v1')

 sns.set()
 sns.set_style("darkgrid")
@@ -191,8 +188,8 @@ def gradiobee(

     logger.debug("lang1: %s, lang2: %s", lang1, lang2)
     if debug:
-        print("gradiobee.py ln 82 lang1: %s, lang2: %s" % (lang1, lang2))
-        print("fast track? ", lang1 in lang_en_zh and lang2 in lang_en_zh)
+        ic(f" lang1: {lang1}, lang2: {lang2}")
+        ic("fast track? ", lang1 in lang_en_zh and lang2 in lang_en_zh)

     # fast track
     if lang1 in lang_en_zh and lang2 in lang_en_zh:
@@ -225,6 +222,7 @@ def gradiobee(
         )
         return error_msg(msg, "info ")
     try:
+        from radiobee.model_s import model_s  # pylint: disable=import-outside-toplevel
         vec1 = model_s.encode(list1)
         vec2 = model_s.encode(list2)
         # cmat = vec1.dot(vec2.T)
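Note on the last hunk: the slow path now builds the correlation matrix from sentence embeddings via the lazily imported model_s singleton instead of a module-level SentenceTransformer. A minimal sketch of that computation, assuming model_s.encode returns one vector per sentence as an np.ndarray:

    from radiobee.model_s import model_s  # lazy-loaded singleton

    list1 = ["How are you?", "Fine, thanks."]  # hypothetical inputs
    list2 = ["你好吗?", "很好,谢谢。"]
    vec1 = model_s.encode(list1)  # shape (len(list1), dim)
    vec2 = model_s.encode(list2)  # shape (len(list2), dim)
    cmat = vec2.dot(vec1.T)  # entry [i, j]: score of list2[i] vs list1[j]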
radiobee/interpolate_pset.py CHANGED
@@ -1,4 +1,5 @@
 """Interpolate np.nan."""
+# pylint: disable=invalid-name
 from typing import List, Tuple
 import numpy as np
 import pandas as pd
radiobee/lists2cmat.py CHANGED
@@ -1,5 +1,6 @@
 """Convert two lists of str (texts) to correlation matrix."""
-# from typing import Dict, Iterable, Optional, Union
+# pylint: disable=too-many-arguments, too-many-locals, unused-import
+
 from typing import Dict, Iterable, List, Optional, Union  # noqa

 import numpy as np
@@ -32,6 +33,26 @@ def lists2cmat(
     vocabulary_terms: Optional[Union[Dict[str, int], Iterable[str]]] = None
 ) -> np.ndarray:
     # fmt: on
+    """Convert two lists to cmat.
+
+    Args:
+        text1: refer smatrix
+        text2: refer smatrix
+        lang1: optional 1st lang code
+        lang2: optional 2nd lang code
+        dl_type: doc length
+        idf_type: idf type
+        max_df: max doc freq
+        max_n_terms: max n terms
+        min_df: min doc freq
+        model: optional model
+        norm: norm
+        tf_type: term freq type
+        vocabulary_terms: vocab refer smatrix
+
+    Returns
+        cmat
+    """
     if isinstance(text1, str):
         text1 = [text1]
     if isinstance(text2, str):
radiobee/loadtext.py CHANGED
@@ -1,5 +1,4 @@
-"""
-Load file content to text.
+"""Load file content to text.

 Check encoding and load a file to text.

@@ -16,6 +15,8 @@ magic.from_file("testdata/test.pdf")
 original load_textrev
 refer to load_paras.py
 """
+# pylint: disable=line-too-long, unused-variable, unused-import
+
 from typing import Optional, Union  # noqa
 from pathlib import Path
 import cchardet
@@ -34,7 +35,7 @@ def loadtext(filepath: Union[Path, str] = "") -> str:
     if not filepath.is_file():
         logger.error(" file [%s] does not exist or is not a file.", filepath)
         # return None
-        raise Exception(" file [%s] does not exist or is not a file." % filepath)
+        raise Exception(f" file [{filepath}] does not exist or is not a file.")

     # encoding = detect_file(filepath)
     encoding = cchardet.detect(filepath.read_bytes()).get("encoding", "utf8")
@@ -44,7 +45,7 @@ def loadtext(filepath: Union[Path, str] = "") -> str:

     # cchardet: 'GB18030', no need for errors="ignore"
     try:
-        text = filepath.read_text(encoding, errors="ignore")
+        text = filepath.read_text(encoding=encoding, errors="ignore")
     except Exception as exc:
         logger.error(" Opening %s resulted in errors: %s", filepath, exc)
         raise
@@ -53,8 +54,7 @@ def loadtext(filepath: Union[Path, str] = "") -> str:


 def test1():
-    r"""
-    Tests default file.
+    r"""Tests default file.

     defaultdir = r'D:\dl\Dropbox\mat-dir\snippets-mat\pyqt'
     defaultfile = r'notes pyqt tkinter tktable.txt'
@@ -69,10 +69,11 @@ def test1():


 def testgb():
-    r"""
-    Tests D:\dl\Dropbox\shuangyu_ku\txt-books\19部世界名著中英文对照版TXT
-    """
-    file = r"C:\dl\Dropbox\shuangyu_ku\txt-books\19部世界名著中英文对照版TXT" r"\爱丽丝漫游奇境记.txt"
+    r"""Tests shuangyu_ku\txt-books\19部世界名著中英文对照版TXT."""
+    file = (
+        r"C:\dl\Dropbox\shuangyu_ku\txt-books\19部世界名著中英文对照版TXT"
+        r"\爱丽丝漫游奇境记.txt"
+    )
     text = loadtext(file)
     if text:
         # assert len(text) == 190913
@@ -84,10 +85,8 @@ def testgb():
     assert text0 == text[:500]


-def testUTF_16LE():
-    r"""
-    Test 'E:\\beta_final_version\\build\\test_files\\files_for_testing_import\\Folding_Beijing_12.txt'.
-    """
+def test_utf_16le():
+    r"""Test 'E:\\beta_final_version\\build\\test_files\\files_for_testing_import\\Folding_Beijing_12.txt'."""
     # file = 'E:\\beta_final_version\\build\\test_files\\files_for_testing_import\\Folding_Beijing_12.txt'  # NOQA
     file = r"C:\dl\Dropbox\mat-dir\snippets-mat\pyqt\Sandbox\hp_beta-version_files\test_files\files_for_testing_import\Folding_Beijing_12.txt"  # NOQA
     file = r"C:\dl\Dropbox\mat-dir\pyqt\Sandbox\hp_beta-version_files\test_files\files_for_testing_import\Folding_Beijing_12.txt"
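Note on loadtext's detect-then-decode flow: cchardet.detect returns a dict such as {"encoding": "GB18030", "confidence": 0.99}, which .get("encoding", "utf8") unpacks with a fallback. A minimal sketch (hypothetical path):

    from pathlib import Path
    import cchardet

    fpath = Path("data/test-dual.txt")  # hypothetical file
    encoding = cchardet.detect(fpath.read_bytes()).get("encoding", "utf8")
    text = fpath.read_text(encoding=encoding, errors="ignore")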
radiobee/mdx_e2c.py CHANGED
@@ -3,6 +3,7 @@
 mdx_e2c = joblib.load("./mdx_dict_e2c.lzma")
 mdx_c2e = joblib.load("./mdx_dict_e2c.lzma")
 """
+# pylint: disable=invalid-name,
 from pathlib import Path
 from string import punctuation
 import joblib
radiobee/model_s.py ADDED
@@ -0,0 +1,37 @@
+"""Load model_s."""
+# pylint: disable=invalid-name
+
+from pathlib import Path
+
+import joblib
+from huggingface_hub import hf_hub_url, cached_download  # hf_hub_download,
+from alive_progress import alive_bar
+from logzero import logger
+
+
+def load_model_s():
+    """Load local model_s if present, else fetch from hf.co."""
+    file_loc = "radiobee/model_s"
+    if Path(file_loc).exists():
+        # raise Exception(f"File {file_loc} does not exist.")
+
+        with alive_bar(1, title=" Loading model_s, takes ~30 secs ...", length=3) as progress_bar:
+            model = joblib.load(file_loc)
+
+            # model_s = pickle.load(open(file_loc, "rb"))
+            progress_bar()  # pylint: disable=not-callable
+
+        return model
+
+    logger.info(
+        "Fetching and caching model_s from huggingface.co... "
+        "The first time may take a while depending on your net."
+    )
+    with alive_bar(1, title=" Subsequent loading takes ~20 secs ...", length=3) as progress_bar:
+        model = joblib.load(cached_download(hf_hub_url("mikeee/model_s", "model_s")))
+        progress_bar()  # pylint: disable=not-callable
+
+    return model
+
+
+model_s = load_model_s()
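Usage sketch for the new module: load_model_s() runs once at import time, so every importer shares one instance, and cached_download means only the very first fetch hits the network. Assuming model_s is the SentenceTransformer that gradiobee.py previously built inline, encoding works as:

    from radiobee.model_s import model_s

    vecs = model_s.encode(["a sentence", "another sentence"])
    print(vecs.shape)  # (2, embedding_dim)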
radiobee/paras2sents.py ADDED
@@ -0,0 +1,110 @@
+"""Convert paras to sents."""
+# pylint: disable=unused-import, too-many-branches, ungrouped-imports
+
+from typing import Callable, List, Optional, Tuple, Union
+
+from itertools import zip_longest
+import numpy as np
+import pandas as pd
+from logzero import logger
+
+from radiobee.align_sents import align_sents
+from radiobee.seg_text import seg_text
+from radiobee.detect import detect
+
+try:
+    from radiobee.shuffle_sents import shuffle_sents
+except Exception as exc:
+    logger.error("shuffle_sents not available: %s, using align_sents", exc)
+    shuffle_sents = lambda x1, x2, lang1="", lang2="": align_sents(x1, x2)  # noqa
+
+
+def paras2sents(
+    paras_: Union[pd.DataFrame, List[Tuple[str, str, Union[str, float]]], np.ndarray],
+    align_func: Optional[Union[Callable, str]] = None,
+    lang1: Optional[str] = None,
+    lang2: Optional[str] = None,
+) -> List[Tuple[str, str, Union[str, float]]]:
+    """Convert paras to sents using align_func.
+
+    Args:
+        paras_: list of 3-tuples or numpy or pd.DataFrame
+        lang1: first lang code
+        lang2: second lang code
+        align_func: func used in the sent level
+            if set to None, default to align_sents
+    Returns:
+        list of sents (possibly with likelihood for shuffle_sents)
+    """
+    # wrap everything in pd.DataFrame
+    # necessary to make pyright happy
+    paras = pd.DataFrame(paras_).fillna("")
+
+    # take the first three columns at maximum
+    paras = paras.iloc[:, :3]
+
+    if len(paras.columns) < 2:
+        logger.error(
+            "Need at least two columns, got %s",
+            len(paras.columns)
+        )
+        raise Exception("wrong data")
+
+    # append the third col (all "") if there are only two cols
+    if len(paras.columns) < 3:
+        paras.insert(2, "likelihood", [""] * len(paras))
+
+    if lang1 is None:
+        lang1 = detect(" ".join(paras.iloc[:, 0]))
+    if lang2 is None:
+        lang2 = detect(" ".join(paras.iloc[:, 1]))
+
+    left, right = [], []
+    row0, row1 = [], []
+    for elm0, elm1, elm2 in paras.values:
+        sents0 = seg_text(elm0, lang1)
+        sents1 = seg_text(elm1, lang2)
+        if isinstance(elm2, float) and elm2 > 0:
+            if row0 or row1:
+                left.append(row0)
+                right.append(row1)
+                row0, row1 = [], []  # collect and prepare
+
+            if sents0:
+                left.append(sents0)
+            if sents1:
+                right.append(sents1)
+        else:
+            if sents0:
+                row0.extend(sents0)
+            if sents1:
+                row1.extend(sents1)
+    # collect possible last batch
+    if row0 or row1:
+        left.append(row0)
+        right.append(row1)
+
+    # res = [*zip(left, right)]
+
+    # align each batch using align_func
+
+    # ready align_func
+    if align_func is None:
+        align_func = align_sents
+    if isinstance(align_func, str) and align_func.startswith("shuffle") or not isinstance(align_func, str) and align_func.__name__ in ["shuffle_sents"]:
+        align_func = lambda row0, row1: shuffle_sents(row0, row1, lang1=lang1, lang2=lang2)  # noqa
+    else:
+        align_func = align_sents
+
+    res = []
+    for row0, row1 in zip(left, right):
+        try:
+            _ = align_func(row0, row1)
+        except Exception as exc:
+            logger.error("errors: %s, resorting to zip_longest", exc)
+            _ = [*zip_longest(row0, row1, fillvalue="")]
+
+        # res.append(_)
+        res.extend(_)
+
+    return res
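Usage sketch for paras2sents (hypothetical data; actual sentence splits depend on seg_text and detect): paragraph triples go in, aligned sentence pairs come out, with a likelihood column when shuffle_sents is the aligner:

    paras = [
        ("Hello world. How are you?", "你好,世界。你好吗?", 0.9),
    ]
    sents = paras2sents(paras)  # align_func=None defaults to align_sents
    for pair in sents:
        print(pair)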
radiobee/plot_cmat.py CHANGED
@@ -1,5 +1,6 @@
 """Plot pandas.DataFrame with DBSCAN clustering."""
-# pylint: disable=invalid-name, too-many-arguments
+# pylint: disable=invalid-name, too-many-arguments, too-many-locals
+
 import numpy as np
 import pandas as pd
 import matplotlib
@@ -37,13 +38,13 @@ def plot_cmat(
     backend: str = "Agg",
     showfig: bool = False,
 ):
-    # ) -> plt:
     # fmt: on
     """Plot df with DBSCAN clustering.

     Args:
         df_: pandas.DataFrame, with three columns columns=["x", "y", "cos"]
-    Returns:
+
+    Returns
         matplotlib.pyplot: for possible use in gradio

     plot_df(pd.DataFrame(cmat2tset(smat), columns=['x', 'y', 'cos']))
radiobee/plot_df.py CHANGED
@@ -1,5 +1,5 @@
 """Plot pandas.DataFrame with DBSCAN clustering."""
-# pylint: disable=invalid-name, too-many-arguments
+# pylint: disable=invalid-name, too-many-arguments, unused-import
 import numpy as np  # noqa
 import pandas as pd
 import matplotlib
@@ -38,6 +38,7 @@ def plot_df(

     Args:
         df_: pandas.DataFrame, with three columns columns=["x", "y", "cos"]
+
     Returns:
         matplotlib.pyplot: for possible use in gradio

radiobee/process_upload.py CHANGED
@@ -1,4 +1,5 @@
 """Process uploads."""
+# pylint: disable=invalid-name, unused-import
 from typing import Union

 from pathlib import Path
@@ -51,7 +52,7 @@ def process_upload(upload: Union[tempfile._TemporaryFileWrapper, bytes]) -> str:

     if encoding is not None:
         try:
-            text = fpath.read_text(encoding)
+            text = fpath.read_text(encoding=encoding)
         except Exception as e:
             logger.error("Unable to retrieve text, error: %s", e)
             text = str(e)
@@ -63,7 +64,7 @@ def process_upload(upload: Union[tempfile._TemporaryFileWrapper, bytes]) -> str:
     # not able to cchardet: encoding is None, docx, pdf, epub, zip etc
     logger.info("Trying docx...to be implemented")

-    # TODO
+    # T ODO .docx .epub .mobi .pdf etc.

     _ = Path(upload.name)
     msg = f"binary file: {_.stem[:-8]}{_.suffix}"
radiobee/seg_text.py CHANGED
@@ -97,6 +97,8 @@ def seg_text(

     Arguments:
         lst: text or text list
+        lang: optional lang code
+        maxlines: (default 1000) threshold for turning on the tqdm progressbar; set to <1 or a large number to turn it off
         extra: re.split(rf"{extra}, text) first
     Returns:
         list of splitted text.
radiobee/shuffle_sents.py CHANGED
@@ -1,8 +1,9 @@
 """Shuffle sents."""
-# pylint: disable=
+# pylint: disable=unused-import, too-many-arguments, too-many-locals,

 from typing import List, Optional, Tuple, Union

+import pandas as pd
 from fastlid import fastlid
 from logzero import logger  # noqa

@@ -26,12 +27,23 @@ def shuffle_sents(
     lang2: Optional[str] = None,
 ) -> List[Tuple[str, str, Union[str, float]]]:
     # fmt: on
-    """shuffle sents to the right positions.
+    """Shuffle sents to the right positions.

     Based on __main__.py.
+
+    eps: float = 6
+    min_samples: int = 4
+    tf_type: str = "linear"
+    idf_type: Optional[str] = None
+    dl_type: Optional[str] = None
+    norm: Optional[str] = None
+    lang1: Optional[str] = "en"
+    lang2: Optional[str] = "zh"
     """
     set_languages = fastlid.set_languages
-    fastlid.set_languages = ["en", "zh"]
+    # fastlid.set_languages = ["en", "zh"]
+    fastlid.set_languages = None
+
     if lang1 is None:
         lang1, _ = fastlid(" ".join(lst1))
     if lang2 is None:
@@ -40,16 +52,28 @@ def shuffle_sents(
     # restore fastlid.set_languages
     fastlid.set_languages = set_languages

-    cmat = lists2cmat(
-        lst1,
-        lst2,
-        tf_type=tf_type,
-        idf_type=idf_type,
-        dl_type=dl_type,
-        norm=norm,
-        lang1=lang1,
-        lang2=lang2,
-    )
+    lang_dicts = ["en", "zh"]
+    if lang1 in lang_dicts and lang2 in lang_dicts:
+        cmat = lists2cmat(
+            lst1,
+            lst2,
+            tf_type=tf_type,
+            idf_type=idf_type,
+            dl_type=dl_type,
+            norm=norm,
+            lang1=lang1,
+            lang2=lang2,
+        )
+    else:  # use model_s
+        from radiobee.model_s import model_s  # pylint: disable=import-outside-toplevel
+        vec1 = model_s.encode(lst1)
+        vec2 = model_s.encode(lst2)
+        # cmat = vec1.dot(vec2.T)
+        cmat = vec2.dot(vec1.T)
+
+    shuffle_sents.cmat = cmat
+    shuffle_sents.lang1 = lang1
+    shuffle_sents.lang2 = lang2

     pset = gen_pset(
         cmat,
@@ -63,6 +87,11 @@ def shuffle_sents(

     final_list = align_texts(aset, lst2, lst1)

-    return final_list
+    # return final_list
+
+    # swap columns 0, 1
+    _ = pd.DataFrame(final_list)
+
+    _ = _.iloc[:, [1, 0] + [*range(2, _.shape[1])]]

-    # return [("", "")]
+    return _.to_numpy().tolist()
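Note on the new return value: the iloc trick swaps columns 0 and 1 while leaving any further columns (e.g. likelihood) in place. A small verifiable example:

    import pandas as pd

    df = pd.DataFrame([["zh0", "en0", 0.9], ["zh1", "en1", 0.8]])
    swapped = df.iloc[:, [1, 0] + [*range(2, df.shape[1])]]
    print(swapped.to_numpy().tolist())
    # [['en0', 'zh0', 0.9], ['en1', 'zh1', 0.8]]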
radiobee/smatrix.py CHANGED
@@ -3,13 +3,16 @@
 refer also to fast-scores fast_scores.py and gen_model.py (sklearn.feature_extraction.text.TfidfVectorizer).
 originally docterm_scores.py.
 """
+# pylint: disable=invalid-name, too-many-locals, too-many-arguments
+
 from typing import Dict, Iterable, Optional, Union
-import numpy as np
 from itertools import chain
+import numpy as np
 from psutil import virtual_memory
 from more_itertools import ilen

 from textacy.representations import Vectorizer
+
 # from textacy.representations.vectorizers import Vectorizer
 from logzero import logger

@@ -51,8 +54,8 @@ def smatrix(
     for xelm in iter(doc1):
         for elm in iter(xelm):
             assert isinstance(elm, str)
-    except AssertionError:
-        raise AssertionError(" doc1 is not of the typing Iterable[Iterable[str]] ")
+    except AssertionError as exc:
+        raise AssertionError(" doc1 is not of the typing Iterable[Iterable[str]] ") from exc
     except Exception as e:
         logger.error(e)
         raise
@@ -60,8 +63,8 @@ def smatrix(
     for xelm in iter(doc2):
         for elm in iter(xelm):
             assert isinstance(elm, str)
-    except AssertionError:
-        raise AssertionError(" doc2 is not of the typing Iterable[Iterable[str]] ")
+    except AssertionError as exc:
+        raise AssertionError(" doc2 is not of the typing Iterable[Iterable[str]] ") from exc
     except Exception as e:
         logger.error(e)
         raise
radiobee/text2lists.py CHANGED
@@ -1,5 +1,6 @@
 """Separate text to zh en lists."""
-# pylint: disable=
+# pylint: disable=unused-import, too-many-locals, invalid-name, too-many-branches, too-many-statements,
+

 # from typing import Tuple,
 from typing import Iterable, List, Optional, Tuple, Union  # noqa
requirements.txt CHANGED
@@ -24,4 +24,6 @@ pycld2
 tqdm
 polyglot
 sentence_splitter
-icecream
+icecream
+# lazy
+alive-progress
run-pydocstle.bat ADDED
@@ -0,0 +1 @@
+pydocstyle --convention=google radiobee tests
run-pylint.bat ADDED
@@ -0,0 +1 @@
+pylint radiobee -d duplicate-code
tests/test_align_sents.py CHANGED
@@ -1,9 +1,14 @@
 """Test align_sents."""
 from radiobee.align_sents import align_sents
+from radiobee.seg_text import seg_text

+text1 = """`Wretched inmates!' I ejaculated mentally, `you deserve perpetual isolation from your species for your churlish inhospitality. At least, I would not keep my doors barred in the day time. I don't care--I will get in!' So resolved, I grasped the latch and shook it vehemently. Vinegar-faced Joseph projected his head from a round window of the barn."""
+text2 = """“被囚禁的囚犯!”我在精神上被射精,“你应该永远与你的物种隔绝,因为你这种粗鲁的病态。至少,我白天不会锁门,我不在乎,我进去了!”我决心如此,我抓住了门锁,狠狠地摇了一下。醋脸的约瑟夫从谷仓的圆窗朝他的头照射。"""
+text3 = """"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit. Zumindest würde ich meine Türen tagsüber nicht verriegeln. Das ist mir egal - ich werde reinkommen!' So entschlossen, ergriff ich die Klinke und rüttelte heftig daran. Der essiggesichtige Joseph streckte seinen Kopf aus einem runden Fenster der Scheune."""

-def test_align_sents():
-    """Test align_sents."""
+
+def test_align_sents_sanity():
+    """Test align_sents sanity check."""
     lst1, lst2 = [
         "a",
         "bs",
@@ -11,3 +16,56 @@ def test_align_sents():
     res = align_sents(lst1, lst2)

     assert res == [("a", "aaa"), ("a", "34"), ("bs", "a"), ("bs", "b")]
+
+
+def test_align_sents_en_zh():
+    """Test align_sents en-zh."""
+    sents_en = seg_text(text1)
+    sents_zh = seg_text(text2)
+
+    # 9ms vs shuffle_sents 50ms, shuffle_sents with lang1/lang2 40ms
+    res = align_sents(sents_en, sents_zh)
+
+    _ = """res[2:4]
+    Out[26]:
+    [('At least, I would not keep my doors barred in the day time.',
+      '至少,我白天不会锁门,我不在乎,我进去了!”'),
+     ("I don't care--I will get in!'", '至少,我白天不会锁门,我不在乎,我进去了!”')]
+    """
+    assert "至少" in str(res[2])
+    assert "至少" in str(res[3])
+
+
+def test_align_sents_en_de():
+    """Test align_sents en-de."""
+    sents_en = seg_text(text1)
+    sents_de = seg_text(text3)
+
+    res1 = align_sents(sents_en, sents_de)
+    _ = """In [48]: res1[:2]
+    Out[48]:
+    [("`Wretched inmates!'",
+      '"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit.'),
+     ('I ejaculated mentally, `you deserve perpetual isolation from your species for your churlish inhospitality.',
+      '"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit.')]
+    """
+    assert "Elende" in str(res1[0])
+    assert "Elende" in str(res1[1])
+
+
+_ = """
+[("`Wretched inmates!'",
+  '"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit.'),
+ ('I ejaculated mentally, `you deserve perpetual isolation from your species for your churlish inhospitality.',
+  '"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit.'),
+ ('At least, I would not keep my doors barred in the day time.',
+  'Zumindest würde ich meine Türen tagsüber nicht verriegeln.'),
+ ("I don't care--I will get in!'",
+  "Das ist mir egal - ich werde reinkommen!'"),
+ ('So resolved, I grasped the latch and shook it vehemently.',
+  'So entschlossen, ergriff ich die Klinke und rüttelte heftig daran.'),
+ ('Vinegar-faced Joseph projected his head from a round window of the barn.',
+  'Der essiggesichtige Joseph streckte seinen Kopf aus einem runden Fenster der Scheune.')]
+"""
tests/test_lists2cmat_hlm.py CHANGED
@@ -37,9 +37,9 @@ def test_lists2cmat_hlm():
     # cmat = texts2cmat(lst1, lst2, lang1, lang2)
     cmat = lists2cmat(lst1, lst2, lang1, lang2)

-    assert cmat.shape == (36, 33)
+    assert cmat.shape == (55, 135)

     cmat21 = lists2cmat(lst2, lst1, lang2, lang1)

-    assert cmat21.shape == (33, 36)
+    assert cmat21.shape == (135, 55)
     assert lists2cmat(lst2, lst1).mean() > 0.05  # 0.09
tests/test_paras2sents.py ADDED
@@ -0,0 +1,34 @@
+"""Test paras2sents."""
+# pylint: disable=invalid-name
+
+import pandas as pd
+from radiobee.paras2sents import paras2sents
+from radiobee.shuffle_sents import shuffle_sents
+
+file_loc = r"data/test-dual-zh-en.xlsx"
+paras = pd.read_excel(file_loc, header=0)
+paras = paras[["text1", "text2", "likelihood"]].fillna("")
+
+
+def test_paras2sents_dual():
+    """Test paras2sents_dual."""
+    sents = paras2sents(paras)
+
+    assert len(sents) > 202  # 208
+    # assert not sents
+
+
+def test_paras2sents_dual_model_s():
+    """Test paras2sents_dual_model_s."""
+    sents = paras2sents(paras, shuffle_sents)
+
+    assert len(sents) > 201  # 207
+    # assert not sents
+
+
+_ = """
+df = pd.DataFrame(
+    [list(sent) + [""] if len(sent) == 2 else list(sent) for sent in sents]
+).fillna("")
+
+"""
tests/test_shuffle_sents.py ADDED
@@ -0,0 +1,136 @@
+"""Test shuffle_sents.
+
+eps: float = 6
+min_samples: int = 4
+tf_type: str = "linear"
+idf_type: Optional[str] = None
+dl_type: Optional[str] = None
+norm: Optional[str] = None
+lang1: Optional[str] = "en"
+lang2: Optional[str] = "zh"
+"""
+from radiobee.seg_text import seg_text
+from radiobee.shuffle_sents import shuffle_sents
+from radiobee.align_sents import align_sents
+
+text1 = """`Wretched inmates!' I ejaculated mentally, `you deserve perpetual isolation from your species for your churlish inhospitality. At least, I would not keep my doors barred in the day time. I don't care--I will get in!' So resolved, I grasped the latch and shook it vehemently. Vinegar-faced Joseph projected his head from a round window of the barn."""
+text2 = """“被囚禁的囚犯!”我在精神上被射精,“你应该永远与你的物种隔绝,因为你这种粗鲁的病态。至少,我白天不会锁门,我不在乎,我进去了!”我决心如此,我抓住了门锁,狠狠地摇了一下。醋脸的约瑟夫从谷仓的圆窗朝他的头照射。"""
+text3 = """"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit. Zumindest würde ich meine Türen tagsüber nicht verriegeln. Das ist mir egal - ich werde reinkommen!' So entschlossen, ergriff ich die Klinke und rüttelte heftig daran. Der essiggesichtige Joseph streckte seinen Kopf aus einem runden Fenster der Scheune."""
+
+
+def test_shuffle_sents_en_zh():
+    """Test shuffle_sents_en_zh."""
+    sents_en = seg_text(text1)
+    sents_zh = seg_text(text2)
+
+    lang1 = "en"
+    lang2 = "zh"
+
+    pairs = shuffle_sents(sents_en, sents_zh)
+    pairs_ = shuffle_sents(sents_en, sents_zh, lang1=lang1, lang2=lang2)
+
+    # pairs[3] == ('', "I don't care--I will get in!'", '')
+    assert pairs == pairs_
+
+    # assert not pairs[3][0]
+    # after swapping
+    assert not pairs[3][1]
+
+
+def test_shuffle_sents_en_de():
+    """Test shuffle_sents_en_de."""
+    sents_en = seg_text(text1)
+    sents_de = seg_text(text3)
+
+    lang1 = "en"
+    lang2 = "de"
+
+    pairs = shuffle_sents(sents_en, sents_de)
+    pairs_ = shuffle_sents(sents_en, sents_de, lang1=lang1, lang2=lang2)
+
+    assert pairs == pairs_
+
+    #
+    # assert not pairs[3][0]
+    _ = """In [218]: pairs[:2]
+    Out[218]:
+    [["`Wretched inmates!'", '', ''],
+     ['I ejaculated mentally, `you deserve perpetual isolation from your species for your churlish inhospitality.',
+      '"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit.',
+      0.62]]
+    """
+    assert not pairs[0][1]
+    assert "mentally" in str(pairs[1]) and "Elende" in str(pairs[1])
+
+    # [elm[2] for elm in pairs]
+    # ['', 0.62, 0.72, 0.74, 0.68, 0.79]
+    if isinstance(pairs[1][2], float):
+        assert pairs[1][2] > 0.6
+    if isinstance(pairs[2][2], float):
+        assert pairs[2][2] > 0.7
+    if isinstance(pairs[3][2], float):
+        assert pairs[3][2] > 0.7
+    if isinstance(pairs[4][2], float):
+        assert pairs[4][2] > 0.6
+    if isinstance(pairs[5][2], float):
+        assert pairs[5][2] > 0.7
+
+
+_ = """
+In [232]: shuffle_sents.cmat.round(2)
+Out[232]:
+array([[ 0.27,  0.62,  0.07,  0.11,  0.02,  0.02],
+       [ 0.03,  0.09,  0.72,  0.18,  0.07, -0.07],
+       [ 0.19,  0.07,  0.16,  0.74, -0.01, -0.02],
+       [-0.02,  0.18,  0.16,  0.06,  0.68, -0.04],
+       [ 0.02,  0.07,  0.04, -0.04,  0.02,  0.79]], dtype=float32)
+pairs[1]
+sents_en[1], sents_de[0], shuffle_sents.cmat[0, 1]
+['I ejaculated mentally, `you deserve perpetual isolation from your species for your churlish inhospitality.',
+ '"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit.',
+ 0.62]
+
+pairs[2]
+sents_en[2], sents_de[1], shuffle_sents.cmat[1, 2].round(2)
+Out[244]:
+('At least, I would not keep my doors barred in the day time.',
+ 'Zumindest würde ich meine Türen tagsüber nicht verriegeln.',
+ 0.72)
+...
+
+import matplotlib
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+sns.set()
+sns.set_style("darkgrid")
+plt.ion()
+
+ali = shuffle_sents(sents_en, sents_de)
+sns.heatmap(shuffle_sents.cmat, cmap="viridis_r").invert_yaxis()
+ax = plt.gca()
+ax.set_xlabel(shuffle_sents.lang1)
+ax.set_ylabel(shuffle_sents.lang2)
+
+ali == [["`Wretched inmates!'", '', ''],
+ ['I ejaculated mentally, `you deserve perpetual isolation from your species for your churlish inhospitality.',
+  '"Elende Insassen! ejakulierte ich im Geiste, "ihr verdient die ewige Isolation von eurer Spezies für eure rüpelhafte Ungastlichkeit.',
+  0.62],
+ ['At least, I would not keep my doors barred in the day time.',
+  'Zumindest würde ich meine Türen tagsüber nicht verriegeln.',
+  0.72],
+ ["I don't care--I will get in!'",
+  "Das ist mir egal - ich werde reinkommen!'",
+  0.74],
+ ['So resolved, I grasped the latch and shook it vehemently.',
+  'So entschlossen, ergriff ich die Klinke und rüttelte heftig daran.',
+  0.68],
+ ['Vinegar-faced Joseph projected his head from a round window of the barn.',
+  'Der essiggesichtige Joseph streckte seinen Kopf aus einem runden Fenster der Scheune.',
+  0.79]]
+
+res1 = align_sents(sents_en, sents_de)
+ali = shuffle_sents(sents_en, sents_de)
+for idx in range(1, 6):
+    assert res1[idx] == tuple(ali[idx][:2])
+"""