Spaces:

mikeee
/

radiobee-aligner

Build error

App Files Files Community

freemt committed on Jan 15, 2022

Commit

811503f

1 Parent(s): 97fa834

Update increasing line limit 4000, 6000

Browse files

Files changed (3) hide show

gradio_queue.db +0 -0
radiobee/gradiobee.py +21 -8
tests/test_lists2cmat_hlm.py +45 -0

gradio_queue.db CHANGED Viewed

Binary files a/gradio_queue.db and b/gradio_queue.db differ

radiobee/gradiobee.py CHANGED Viewed

@@ -83,8 +83,6 @@ def gradiobee(
     if file2 is None:
         logger.debug("file2 is None")
         text2 = ""
-        # TODO split text1 to text1 and text2
     else:
         logger.debug("file2.name: %s", file2.name)
         text2 = file2text(file2)
@@ -108,9 +106,13 @@ def gradiobee(
         if not _:  # essentially empty file1
             return error_msg("Nothing worthy of processing in file 1")
         # exit if there are too many lines
-        if len(_) > len_max:
-            return error_msg(f" Too many lines ({len(_)}) > {len_max}, alignment op halted, sorry.", "info")
         _ = zip_longest(_, [""])
         _ = pd.DataFrame(_, columns=["text1", "text2"])
@@ -167,8 +169,12 @@ def gradiobee(
         # exit if there are too many lines
         len12 = len(list1) + len(list2)
-        if len12 > 2 * len_max:
-            return error_msg(f" Too many lines ({len(list1)} + {len(list2)} > {2 * len_max}), alignment op halted, sorry.", "info")
         file_dl = Path(f"{Path(file1.name).stem[:-8]}-{Path(file2.name).stem[:-8]}.csv")
         file_dl_xlsx = Path(
@@ -201,9 +207,15 @@ def gradiobee(
             return error_msg(exc)
     # slow track
     else:
-        if len(list1) + len(list2) > 2000:
             msg = (
-                "This will take too long (> 2 minutes) to complete "
                 "and will hog this experimental server and hinder "
                 "other users from trying the service. "
                 "Aborted...sorry"
@@ -323,6 +335,7 @@ def gradiobee(
     fig.suptitle(f"alignment projection\n(eps={eps}, min_samples={min_samples})")
     _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
     # _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
     _x = ~_

     if file2 is None:
         logger.debug("file2 is None")
         text2 = ""
     else:
         logger.debug("file2.name: %s", file2.name)
         text2 = file2text(file2)
         if not _:  # essentially empty file1
             return error_msg("Nothing worthy of processing in file 1")
+        logger.info(
+            "fast track single fiel: len %1, max %s",
+            len(_), 2 * len_max
+        )
         # exit if there are too many lines
+        if len(_) > 2 * len_max:
+            return error_msg(f" Too many lines ({len(_)}) > {2 * len_max}, alignment op halted, sorry.", "info")
         _ = zip_longest(_, [""])
         _ = pd.DataFrame(_, columns=["text1", "text2"])
         # exit if there are too many lines
         len12 = len(list1) + len(list2)
+        logger.info(
+            "fast track: len1 %s, len2 %s, tot %s, max %s",
+            len(list1), len(list2), len(list1) + len(list2), 3 * len_max
+        )
+        if len12 > 3 * len_max:
+            return error_msg(f" Too many lines ({len(list1)} + {len(list2)} > {3 * len_max}), alignment op halted, sorry.", "info")
         file_dl = Path(f"{Path(file1.name).stem[:-8]}-{Path(file2.name).stem[:-8]}.csv")
         file_dl_xlsx = Path(
             return error_msg(exc)
     # slow track
     else:
+        logger.info(
+            "slow track: len1 %s, len2 %s, tot: %s, max %s",
+            len(list1), len(list2), len(list1) + len(list2),
+            3 * len_max
+        )
+        if len(list1) + len(list2) > 3 * len_max:
             msg = (
+                f" len1 {len(list1)} + len2 {len(list2)} > {3 * len_max}. "
+                "This will take too long to complete "
                 "and will hog this experimental server and hinder "
                 "other users from trying the service. "
                 "Aborted...sorry"
     fig.suptitle(f"alignment projection\n(eps={eps}, min_samples={min_samples})")
     _ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
     # _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
     _x = ~_

tests/test_lists2cmat_hlm.py ADDED Viewed

	@@ -0,0 +1,45 @@

+"""Test lists2cmat."""
+# pylint: disable=invalid-name
+from itertools import zip_longest
+from fastlid import fastlid
+from radiobee.loadtext import loadtext
+from radiobee.lists2cmat import lists2cmat
+file1 = "data/test_en.txt"  # overwritten two lines below before any use
+file2 = "data/test_zh.txt"  # overwritten two lines below before any use
+file1 = "data/hlm-ch1-en.txt"  # effective value of file1
+file2 = "data/hlm-ch1-zh.txt"  # effective value of file2
+# assume English or Chinese: the candidate languages are assigned to fastlid.set_languages
+fastlid.set_languages = ["en", "zh", ]
+text1 = loadtext(file1)  # executed at import time, before the test function runs
+text2 = loadtext(file2)
+lang1, _ = fastlid(text1)  # keeps the first element of the result (presumably the language code — confirm); the second is discarded
+lang2, _ = fastlid(text2)
+def test_lists2cmat_hlm():
+    """Test lists2cmat."""
+    lst1, lst2 = [], []
+    if text1:
+        lst1 = [_.strip() for _ in text1.splitlines() if _.strip()]  # stripped, non-blank lines of text1
+    if text2:
+        lst2 = [_.strip() for _ in text2.splitlines() if _.strip()]  # stripped, non-blank lines of text2
+    # Line counts noted for en (lst1) and zh (lst2). NOTE(review): the next line has no `assert`, so it is evaluated and discarded; nothing is checked.
+    len(lst1) == 135, len(lst2) == 55  # bare tuple expression; TODO confirm the counts and assert them
+    # earlier call kept for reference: cmat = texts2cmat(lst1, lst2, lang1, lang2)
+    cmat = lists2cmat(lst1, lst2, lang1, lang2)
+    assert cmat.shape == (36, 33)  # NOTE(review): 36/33 do not match the 135/55 noted above; confirm which is current
+    cmat21 = lists2cmat(lst2, lst1, lang2, lang1)  # same call with the two sides swapped
+    assert cmat21.shape == (33, 36)  # the expected dimensions swap along with the arguments
+    assert lists2cmat(lst2, lst1).mean() > 0.05  # 0.09 (value noted by the author); language arguments are omitted in this call