cointegrated committed on
Commit
46fe50f
·
1 Parent(s): c45b82d

detect the script automatically, if not specified

Browse files
Files changed (3) hide show
  1. app.py +9 -6
  2. myv_translit.py +36 -1
  3. test_translit.py +10 -1
app.py CHANGED
@@ -1,12 +1,17 @@
1
  import gradio as gr
2
 
3
- from myv_translit import lat2cyr, cyr2lat
4
 
 
5
 
6
- def transliterator(input_text, direction_to_latn=1, joint_acute=True, not_first_e_with_hacek=False, not_soft_l_after_vowels=True):
 
7
  first_e_with_hacek = not not_first_e_with_hacek
8
  soft_l_after_vowels = not not_soft_l_after_vowels
9
- if direction_to_latn:
 
 
 
10
  result = cyr2lat(input_text, joint_acute=joint_acute, first_e_with_hacek=first_e_with_hacek, soft_l_after_vowels=soft_l_after_vowels)
11
  else:
12
  result = lat2cyr(input_text, joint_acute=joint_acute, first_e_with_hacek=first_e_with_hacek, soft_l_after_vowels=soft_l_after_vowels)
@@ -21,14 +26,12 @@ article = """
21
  - http://valks.erzja.info/2020/04/30/эрзянский-алфавит/
22
  """
23
 
24
- directions = ['lat -> кир', 'кир -> lat']
25
-
26
 
27
  interface = gr.Interface(
28
  transliterator,
29
  [
30
  gr.Textbox(label="Текст", lines=2, placeholder='text to transliterate'),
31
- gr.Radio(choices=directions, type="index", interactive=True, label='Направление'),
32
  gr.Checkbox(value=True, label='L + ́ -> Ĺ'),
33
  gr.Checkbox(value=False, label='ěrzä -> erzä'),
34
  gr.Checkbox(value=False, label='peĺks -> pelks'),
 
1
  import gradio as gr
2
 
3
+ from myv_translit import lat2cyr, cyr2lat, detect_script
4
 
5
+ DIRECTIONS = ['lat -> кир', 'кир -> lat']
6
 
7
+
8
+ def transliterator(input_text, direction, joint_acute=True, not_first_e_with_hacek=False, not_soft_l_after_vowels=True):
9
  first_e_with_hacek = not not_first_e_with_hacek
10
  soft_l_after_vowels = not not_soft_l_after_vowels
11
+ if direction is None:
12
+ code = detect_script(input_text)
13
+ direction = DIRECTIONS[int(code != 'lat')]
14
+ if direction == DIRECTIONS[1]:
15
  result = cyr2lat(input_text, joint_acute=joint_acute, first_e_with_hacek=first_e_with_hacek, soft_l_after_vowels=soft_l_after_vowels)
16
  else:
17
  result = lat2cyr(input_text, joint_acute=joint_acute, first_e_with_hacek=first_e_with_hacek, soft_l_after_vowels=soft_l_after_vowels)
 
26
  - http://valks.erzja.info/2020/04/30/эрзянский-алфавит/
27
  """
28
 
 
 
29
 
30
  interface = gr.Interface(
31
  transliterator,
32
  [
33
  gr.Textbox(label="Текст", lines=2, placeholder='text to transliterate'),
34
+ gr.Radio(choices=DIRECTIONS, type="value", interactive=True, label='Направление'),
35
  gr.Checkbox(value=True, label='L + ́ -> Ĺ'),
36
  gr.Checkbox(value=False, label='ěrzä -> erzä'),
37
  gr.Checkbox(value=False, label='peĺks -> pelks'),
myv_translit.py CHANGED
@@ -1,5 +1,5 @@
1
  import re
2
-
3
 
4
  _cyr2lat = [
5
  {'find_what': 'А', 'replacer': 'A', 're': False},
@@ -265,3 +265,38 @@ def cyr2lat(text, joint_acute=True, first_e_with_hacek=True, soft_l_after_vowels
265
  def lat2cyr(text, joint_acute=True, first_e_with_hacek=True, soft_l_after_vowels=True):
266
  # todo: support all the optional settings
267
  return transliterate_with_rules(text, _lat2cyr)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import re
2
+ from collections import Counter
3
 
4
  _cyr2lat = [
5
  {'find_what': 'А', 'replacer': 'A', 're': False},
 
265
  def lat2cyr(text, joint_acute=True, first_e_with_hacek=True, soft_l_after_vowels=True):
266
  # todo: support all the optional settings
267
  return transliterate_with_rules(text, _lat2cyr)
268
+
269
+
270
# Lowercase letters that are counted as Cyrillic by detect_script.
# The full 33-letter alphabet is listed, including 'й'.
CYR_CHARS = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
# Lowercase letters that are counted as Latin by detect_script:
# the complete ASCII alphabet (including 's') plus the accented letters.
BASIC_LAT_CHARS = 'abcdefghijklmnopqrstuvwxyz'
ACCENT_LAT_CHARS = 'ěäüöśźćńŕťďĺ'
LAT_CHARS = BASIC_LAT_CHARS + ACCENT_LAT_CHARS


def detect_script(text: str, min_prevalence: float = 2.0, min_detectable: float = 0.1) -> str:
    """Detect the script of the text.

    The text is lowercased and every character is classified as Cyrillic
    (present in CYR_CHARS), Latin (present in LAT_CHARS) or other; spaces,
    digits and punctuation all count as "other".

    Args:
        text: The text to analyse.
        min_prevalence: One script wins only if its letter count is at least
            this many times the letter count of the other script.
        min_detectable: Minimum share of Cyrillic + Latin letters among all
            characters of the text; below it the script is reported unknown.

    Returns:
        One of:
        - 'cyr' - Cyrillic
        - 'lat' - Latin
        - 'mix' - Mixed Cyrillic and Latin script
        - 'unk' - Unknown script (probably neither Latin nor Cyrillic)
    """
    cyr, lat, other = 0, 0, 0
    for char, cnt in Counter(text.lower()).items():
        if char in CYR_CHARS:
            cyr += cnt
        elif char in LAT_CHARS:
            lat += cnt
        else:
            other += cnt
    letters = cyr + lat
    # Without a single recognised letter there is nothing to decide on; this
    # also covers empty text and keeps `0 >= 0` below from reporting 'cyr'
    # when min_detectable is set to zero.
    if not letters:
        return 'unk'
    if letters / (letters + other) < min_detectable:
        return 'unk'
    if cyr >= lat * min_prevalence:
        return 'cyr'
    if lat >= cyr * min_prevalence:
        return 'lat'
    return 'mix'
test_translit.py CHANGED
@@ -1,4 +1,4 @@
1
- from myv_translit import cyr2lat
2
 
3
 
4
  def test_join_acute():
@@ -18,6 +18,15 @@ def test_soft_l():
18
  assert cyr2lat('пелькс', soft_l_after_vowels=False) == 'pelks'
19
 
20
 
 
 
 
 
 
 
 
 
 
21
  # todo: test on a larger corpus
22
  # todo: test cyclical consistency
23
 
 
1
+ from myv_translit import cyr2lat, detect_script
2
 
3
 
4
  def test_join_acute():
 
18
  assert cyr2lat('пелькс', soft_l_after_vowels=False) == 'pelks'
19
 
20
 
21
def test_detection():
    """Check detect_script on unknown, Latin, Cyrillic and mixed samples."""
    expectations = [
        # mostly digits and punctuation: too few letters to decide
        ('123 456?? 8743 098543 ???...,.! @%%&&& хз', 'unk'),
        ('ěrzä', 'lat'),
        # a single letter of the other script does not change the verdict
        ('ěrzä ю', 'lat'),
        ('ЭРЗЯ', 'cyr'),
        ('ЭРЗЯ d', 'cyr'),
        # comparable amounts of both scripts
        ('ěrzä эрзянь', 'mix'),
    ]
    for sample, script in expectations:
        assert detect_script(sample) == script
28
+
29
+
30
  # todo: test on a larger corpus
31
  # todo: test cyclical consistency
32