Spaces:
Sleeping
Sleeping
Commit
·
46fe50f
1
Parent(s):
c45b82d
detect the script automatically, if not specified
Browse files- app.py +9 -6
- myv_translit.py +36 -1
- test_translit.py +10 -1
app.py
CHANGED
@@ -1,12 +1,17 @@
|
|
1 |
import gradio as gr
|
2 |
|
3 |
-
from myv_translit import lat2cyr, cyr2lat
|
4 |
|
|
|
5 |
|
6 |
-
|
|
|
7 |
first_e_with_hacek = not not_first_e_with_hacek
|
8 |
soft_l_after_vowels = not not_soft_l_after_vowels
|
9 |
-
if
|
|
|
|
|
|
|
10 |
result = cyr2lat(input_text, joint_acute=joint_acute, first_e_with_hacek=first_e_with_hacek, soft_l_after_vowels=soft_l_after_vowels)
|
11 |
else:
|
12 |
result = lat2cyr(input_text, joint_acute=joint_acute, first_e_with_hacek=first_e_with_hacek, soft_l_after_vowels=soft_l_after_vowels)
|
@@ -21,14 +26,12 @@ article = """
|
|
21 |
- http://valks.erzja.info/2020/04/30/эрзянский-алфавит/
|
22 |
"""
|
23 |
|
24 |
-
directions = ['lat -> кир', 'кир -> lat']
|
25 |
-
|
26 |
|
27 |
interface = gr.Interface(
|
28 |
transliterator,
|
29 |
[
|
30 |
gr.Textbox(label="Текст", lines=2, placeholder='text to transliterate'),
|
31 |
-
gr.Radio(choices=
|
32 |
gr.Checkbox(value=True, label='L + ́ -> Ĺ'),
|
33 |
gr.Checkbox(value=False, label='ěrzä -> erzä'),
|
34 |
gr.Checkbox(value=False, label='peĺks -> pelks'),
|
|
|
1 |
import gradio as gr
|
2 |
|
3 |
+
from myv_translit import lat2cyr, cyr2lat, detect_script
|
4 |
|
5 |
+
DIRECTIONS = ['lat -> кир', 'кир -> lat']
|
6 |
|
7 |
+
|
8 |
+
def transliterator(input_text, direction, joint_acute=True, not_first_e_with_hacek=False, not_soft_l_after_vowels=True):
|
9 |
first_e_with_hacek = not not_first_e_with_hacek
|
10 |
soft_l_after_vowels = not not_soft_l_after_vowels
|
11 |
+
if direction is None:
|
12 |
+
code = detect_script(input_text)
|
13 |
+
direction = DIRECTIONS[int(code != 'lat')]
|
14 |
+
if direction == DIRECTIONS[1]:
|
15 |
result = cyr2lat(input_text, joint_acute=joint_acute, first_e_with_hacek=first_e_with_hacek, soft_l_after_vowels=soft_l_after_vowels)
|
16 |
else:
|
17 |
result = lat2cyr(input_text, joint_acute=joint_acute, first_e_with_hacek=first_e_with_hacek, soft_l_after_vowels=soft_l_after_vowels)
|
|
|
26 |
- http://valks.erzja.info/2020/04/30/эрзянский-алфавит/
|
27 |
"""
|
28 |
|
|
|
|
|
29 |
|
30 |
interface = gr.Interface(
|
31 |
transliterator,
|
32 |
[
|
33 |
gr.Textbox(label="Текст", lines=2, placeholder='text to transliterate'),
|
34 |
+
gr.Radio(choices=DIRECTIONS, type="value", interactive=True, label='Направление'),
|
35 |
gr.Checkbox(value=True, label='L + ́ -> Ĺ'),
|
36 |
gr.Checkbox(value=False, label='ěrzä -> erzä'),
|
37 |
gr.Checkbox(value=False, label='peĺks -> pelks'),
|
myv_translit.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import re
|
2 |
-
|
3 |
|
4 |
_cyr2lat = [
|
5 |
{'find_what': 'А', 'replacer': 'A', 're': False},
|
@@ -265,3 +265,38 @@ def cyr2lat(text, joint_acute=True, first_e_with_hacek=True, soft_l_after_vowels
|
|
265 |
def lat2cyr(text, joint_acute=True, first_e_with_hacek=True, soft_l_after_vowels=True):
|
266 |
# todo: support all the optional settings
|
267 |
return transliterate_with_rules(text, _lat2cyr)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import re
|
2 |
+
from collections import Counter
|
3 |
|
4 |
_cyr2lat = [
|
5 |
{'find_what': 'А', 'replacer': 'A', 're': False},
|
|
|
265 |
def lat2cyr(text, joint_acute=True, first_e_with_hacek=True, soft_l_after_vowels=True):
    """Transliterate ``text`` by applying the ``_lat2cyr`` rule table.

    ``joint_acute``, ``first_e_with_hacek`` and ``soft_l_after_vowels`` are
    accepted so the signature mirrors ``cyr2lat``, but they are not consulted
    yet: the result depends on ``text`` only.
    """
    # todo: support all the optional settings
    rules = _lat2cyr
    return transliterate_with_rules(text, rules)
|
268 |
+
|
269 |
+
|
270 |
+
# Lower-case alphabets used for script detection. Both must be complete:
# a letter missing here is counted as "other" and skews the detection.
CYR_CHARS = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
BASIC_LAT_CHARS = 'abcdefghijklmnopqrstuvwxyz'
ACCENT_LAT_CHARS = 'ěäüöśźćńŕťďĺ'
LAT_CHARS = BASIC_LAT_CHARS + ACCENT_LAT_CHARS


def detect_script(text: str, min_prevalence: float = 2.0, min_detectable: float = 0.1) -> str:
    """ Detect the script of the text.

    The text is lower-cased and every character is classified as Cyrillic
    (in ``CYR_CHARS``), Latin (in ``LAT_CHARS``) or other; whitespace,
    digits and punctuation all count as "other".

    Args:
        text: the text to inspect.
        min_prevalence: how many times more frequent the letters of one
            script must be than the letters of the other one for the text
            to be attributed to that script.
        min_detectable: minimal share of recognised (Cyrillic + Latin)
            letters among all characters; below it the result is ``unk``.

    Possible values:
    - cyr - Cyrillic
    - lat - Latin
    - mix - Mixed Cyrillic and Latin script
    - unk - Unknown script (probably neither Latin nor Cyrillic)
    """
    cyr, lat, other = 0, 0, 0
    char_cnt = Counter(text.lower())
    for char, cnt in char_cnt.items():
        if char in CYR_CHARS:
            cyr += cnt
        elif char in LAT_CHARS:
            lat += cnt
        else:
            other += cnt

    letters = cyr + lat
    # No recognised letters at all (this also covers the empty text): without
    # this guard ``0 >= 0 * min_prevalence`` would report 'cyr' whenever
    # ``min_detectable`` is not positive.
    if not letters:
        return 'unk'
    if letters / (letters + other) < min_detectable:
        return 'unk'
    if cyr >= lat * min_prevalence:
        return 'cyr'
    if lat >= cyr * min_prevalence:
        return 'lat'
    return 'mix'
|
test_translit.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from myv_translit import cyr2lat
|
2 |
|
3 |
|
4 |
def test_join_acute():
|
@@ -18,6 +18,15 @@ def test_soft_l():
|
|
18 |
assert cyr2lat('пелькс', soft_l_after_vowels=False) == 'pelks'
|
19 |
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
# todo: test on a larger corpus
|
22 |
# todo: test cyclical consistency
|
23 |
|
|
|
1 |
+
from myv_translit import cyr2lat, detect_script
|
2 |
|
3 |
|
4 |
def test_join_acute():
|
|
|
18 |
assert cyr2lat('пелькс', soft_l_after_vowels=False) == 'pelks'
|
19 |
|
20 |
|
21 |
+
def test_detection():
    """Check detect_script on noise, single-script, lightly mixed and evenly mixed samples."""
    expected_by_sample = [
        # mostly digits and punctuation, only two letters
        ('123 456?? 8743 098543 ???...,.! @%%&&& хз', 'unk'),
        ('ěrzä', 'lat'),
        # a single letter of the other script does not change the verdict
        ('ěrzä ю', 'lat'),
        ('ЭРЗЯ', 'cyr'),
        ('ЭРЗЯ d', 'cyr'),
        # comparable amounts of both scripts
        ('ěrzä эрзянь', 'mix'),
    ]
    for sample, script in expected_by_sample:
        assert detect_script(sample) == script
|
28 |
+
|
29 |
+
|
30 |
# todo: test on a larger corpus
|
31 |
# todo: test cyclical consistency
|
32 |
|