Spaces:
Build error
Build error
freemt
committed on
Commit
·
811503f
1
Parent(s):
97fa834
Update increasing line limit 4000, 6000
Browse files- gradio_queue.db +0 -0
- radiobee/gradiobee.py +21 -8
- tests/test_lists2cmat_hlm.py +45 -0
gradio_queue.db
CHANGED
Binary files a/gradio_queue.db and b/gradio_queue.db differ
|
|
radiobee/gradiobee.py
CHANGED
@@ -83,8 +83,6 @@ def gradiobee(
|
|
83 |
if file2 is None:
|
84 |
logger.debug("file2 is None")
|
85 |
text2 = ""
|
86 |
-
|
87 |
-
# TODO split text1 to text1 and text2
|
88 |
else:
|
89 |
logger.debug("file2.name: %s", file2.name)
|
90 |
text2 = file2text(file2)
|
@@ -108,9 +106,13 @@ def gradiobee(
|
|
108 |
if not _: # essentially empty file1
|
109 |
return error_msg("Nothing worthy of processing in file 1")
|
110 |
|
|
|
|
|
|
|
|
|
111 |
# exit if there are too many lines
|
112 |
-
if len(_) > len_max:
|
113 |
-
return error_msg(f" Too many lines ({len(_)}) > {len_max}, alignment op halted, sorry.", "info")
|
114 |
|
115 |
_ = zip_longest(_, [""])
|
116 |
_ = pd.DataFrame(_, columns=["text1", "text2"])
|
@@ -167,8 +169,12 @@ def gradiobee(
|
|
167 |
|
168 |
# exit if there are too many lines
|
169 |
len12 = len(list1) + len(list2)
|
170 |
-
|
171 |
-
|
|
|
|
|
|
|
|
|
172 |
|
173 |
file_dl = Path(f"{Path(file1.name).stem[:-8]}-{Path(file2.name).stem[:-8]}.csv")
|
174 |
file_dl_xlsx = Path(
|
@@ -201,9 +207,15 @@ def gradiobee(
|
|
201 |
return error_msg(exc)
|
202 |
# slow track
|
203 |
else:
|
204 |
-
|
|
|
|
|
|
|
|
|
|
|
205 |
msg = (
|
206 |
-
"
|
|
|
207 |
"and will hog this experimental server and hinder "
|
208 |
"other users from trying the service. "
|
209 |
"Aborted...sorry"
|
@@ -323,6 +335,7 @@ def gradiobee(
|
|
323 |
fig.suptitle(f"alignment projection\n(eps={eps}, min_samples={min_samples})")
|
324 |
|
325 |
_ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
|
|
|
326 |
# _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
|
327 |
_x = ~_
|
328 |
|
|
|
83 |
if file2 is None:
|
84 |
logger.debug("file2 is None")
|
85 |
text2 = ""
|
|
|
|
|
86 |
else:
|
87 |
logger.debug("file2.name: %s", file2.name)
|
88 |
text2 = file2text(file2)
|
|
|
106 |
if not _: # essentially empty file1
|
107 |
return error_msg("Nothing worthy of processing in file 1")
|
108 |
|
109 |
+
logger.info(
|
110 |
+
"fast track single file: len %s, max %s",
|
111 |
+
len(_), 2 * len_max
|
112 |
+
)
|
113 |
# exit if there are too many lines
|
114 |
+
if len(_) > 2 * len_max:
|
115 |
+
return error_msg(f" Too many lines ({len(_)}) > {2 * len_max}, alignment op halted, sorry.", "info")
|
116 |
|
117 |
_ = zip_longest(_, [""])
|
118 |
_ = pd.DataFrame(_, columns=["text1", "text2"])
|
|
|
169 |
|
170 |
# exit if there are too many lines
|
171 |
len12 = len(list1) + len(list2)
|
172 |
+
logger.info(
|
173 |
+
"fast track: len1 %s, len2 %s, tot %s, max %s",
|
174 |
+
len(list1), len(list2), len(list1) + len(list2), 3 * len_max
|
175 |
+
)
|
176 |
+
if len12 > 3 * len_max:
|
177 |
+
return error_msg(f" Too many lines ({len(list1)} + {len(list2)} > {3 * len_max}), alignment op halted, sorry.", "info")
|
178 |
|
179 |
file_dl = Path(f"{Path(file1.name).stem[:-8]}-{Path(file2.name).stem[:-8]}.csv")
|
180 |
file_dl_xlsx = Path(
|
|
|
207 |
return error_msg(exc)
|
208 |
# slow track
|
209 |
else:
|
210 |
+
logger.info(
|
211 |
+
"slow track: len1 %s, len2 %s, tot: %s, max %s",
|
212 |
+
len(list1), len(list2), len(list1) + len(list2),
|
213 |
+
3 * len_max
|
214 |
+
)
|
215 |
+
if len(list1) + len(list2) > 3 * len_max:
|
216 |
msg = (
|
217 |
+
f" len1 {len(list1)} + len2 {len(list2)} > {3 * len_max}. "
|
218 |
+
"This will take too long to complete "
|
219 |
"and will hog this experimental server and hinder "
|
220 |
"other users from trying the service. "
|
221 |
"Aborted...sorry"
|
|
|
335 |
fig.suptitle(f"alignment projection\n(eps={eps}, min_samples={min_samples})")
|
336 |
|
337 |
_ = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ > -1
|
338 |
+
|
339 |
# _x = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_ < 0
|
340 |
_x = ~_
|
341 |
|
tests/test_lists2cmat_hlm.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Test lists2cmat."""
|
2 |
+
# pylint: disable=invalid-name
|
3 |
+
|
4 |
+
from itertools import zip_longest
|
5 |
+
from fastlid import fastlid
|
6 |
+
from radiobee.loadtext import loadtext
|
7 |
+
from radiobee.lists2cmat import lists2cmat
|
8 |
+
|
9 |
+
# Fixture pair used by this test module. The original assigned
# "data/test_en.txt" / "data/test_zh.txt" first and then immediately
# overwrote them with the paths below, so only these two were ever used.
file1 = "data/hlm-ch1-en.txt"
file2 = "data/hlm-ch1-zh.txt"

# assume English or Chinese
fastlid.set_languages = ["en", "zh"]

# Loaded once at import time; the test function reads these module globals.
text1 = loadtext(file1)
text2 = loadtext(file2)

# fastlid returns a pair; the first element is bound to lang1/lang2 and the
# second is discarded.
lang1, _ = fastlid(text1)
lang2, _ = fastlid(text2)
|
22 |
+
|
23 |
+
|
24 |
+
def test_lists2cmat_hlm():
    """Check shape and mean of the matrices lists2cmat returns for the hlm pair.

    Uses the module-level ``text1``/``text2`` and ``lang1``/``lang2``.
    """
    # Keep only non-blank lines, stripped; a falsy text yields an empty list.
    lst1 = [line.strip() for line in (text1 or "").splitlines() if line.strip()]
    lst2 = [line.strip() for line in (text2 or "").splitlines() if line.strip()]

    # NOTE(review): the original had the bare expression
    # ``len(lst1) == 135, len(lst2) == 55`` here (commented "en zh"), which
    # evaluated a tuple and checked nothing. The numbers do not obviously agree
    # with the shapes asserted below, so they are not promoted to asserts --
    # TODO confirm the expected line counts for the hlm-ch1 files.

    # cmat = texts2cmat(lst1, lst2, lang1, lang2)
    cmat = lists2cmat(lst1, lst2, lang1, lang2)
    assert cmat.shape == (36, 33)

    # Swapping the two lists (and their languages) swaps the two dimensions.
    cmat21 = lists2cmat(lst2, lst1, lang2, lang1)
    assert cmat21.shape == (33, 36)

    # Language arguments are omitted in this call, as in the original.
    assert lists2cmat(lst2, lst1).mean() > 0.05  # 0.09
|