Mahiruoshi committed on
Commit
361bf5f
1 Parent(s): d67d0bf

Upload 11 files

text/__init__.py CHANGED
@@ -1,14 +1,8 @@
  """ from https://github.com/keithito/tacotron """
  from text import cleaners
- from text.symbols import symbols


- # Mappings from symbol to numeric ID and vice versa:
- _symbol_to_id = {s: i for i, s in enumerate(symbols)}
- _id_to_symbol = {i: s for i, s in enumerate(symbols)}
-
-
- def text_to_sequence(text, cleaner_names):
+ def text_to_sequence(text, symbols, cleaner_names):
      '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
      Args:
        text: string to convert to a sequence
@@ -16,6 +10,8 @@ def text_to_sequence(text, cleaner_names):
      Returns:
        List of integers corresponding to the symbols in the text
      '''
+     _symbol_to_id = {s: i for i, s in enumerate(symbols)}
+
      sequence = []

      clean_text = _clean_text(text, cleaner_names)
@@ -27,26 +23,6 @@ def text_to_sequence(text, cleaner_names):
      return sequence


- def cleaned_text_to_sequence(cleaned_text):
-     '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
-     Args:
-       text: string to convert to a sequence
-     Returns:
-       List of integers corresponding to the symbols in the text
-     '''
-     sequence = [_symbol_to_id[symbol] for symbol in cleaned_text if symbol in _symbol_to_id.keys()]
-     return sequence
-
-
- def sequence_to_text(sequence):
-     '''Converts a sequence of IDs back to a string'''
-     result = ''
-     for symbol_id in sequence:
-         s = _id_to_symbol[symbol_id]
-         result += s
-     return result
-
-
  def _clean_text(text, cleaner_names):
      for name in cleaner_names:
          cleaner = getattr(cleaners, name)
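
The change removes the module-level dependency on text.symbols: text_to_sequence now takes the symbol list as an explicit argument and builds the symbol-to-ID mapping per call, and the standalone cleaned_text_to_sequence / sequence_to_text helpers are dropped. A minimal sketch of the new call pattern follows; the symbol list and cleaner name are placeholders for illustration, not this repository's actual configuration.

from text import text_to_sequence

# Placeholder symbol inventory; a real run would use the model's own
# symbol list (typically loaded from its config/hparams).
symbols = ['_', ',', '.', '!', '?', '-', ' '] + list('abcdefghijklmnopqrstuvwxyz')

# Old signature: text_to_sequence(text, cleaner_names)
# New signature: the caller passes symbols explicitly.
sequence = text_to_sequence('こんにちは', symbols, ['japanese_cleaners'])
print(sequence)  # list of integer IDs for the symbols in the cleaned text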
text/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (1.23 kB)

text/__pycache__/cleaners.cpython-39.pyc ADDED
Binary file (3.74 kB)

text/__pycache__/japanese.cpython-39.pyc ADDED
Binary file (4.43 kB)

text/__pycache__/mandarin.cpython-39.pyc ADDED
Binary file (6.4 kB)
text/cleaners.py CHANGED
@@ -1,21 +1,33 @@
  import re
- from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
- from text.japanese import clean_japanese, japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3
+ from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3
  from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2

- def none_cleaner(text):
-     return text
-
  def japanese_cleaners(text):
-     text = clean_japanese(text)
-     text = re.sub(r'([A-Za-z])$', r'\1.', text)
+     from text.japanese import japanese_to_romaji_with_accent
+     text = japanese_to_romaji_with_accent(text)
+     if re.match('[A-Za-z]', text[-1]):
+         text += '.'
      return text

+
  def japanese_cleaners2(text):
      return japanese_cleaners(text).replace('ts', 'ʦ').replace('...', '…')

+
+ def korean_cleaners(text):
+     '''Pipeline for Korean text'''
+     from text.korean import latin_to_hangul, number_to_hangul, divide_hangul
+     text = latin_to_hangul(text)
+     text = number_to_hangul(text)
+     text = divide_hangul(text)
+     if re.match('[\u3131-\u3163]', text[-1]):
+         text += '.'
+     return text
+
+
  def chinese_cleaners(text):
      '''Pipeline for Chinese text'''
+     from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo
      text = number_to_chinese(text)
      text = chinese_to_bopomofo(text)
      text = latin_to_bopomofo(text)
@@ -23,7 +35,10 @@ def chinese_cleaners(text):
      text += '。'
      return text

+
  def zh_ja_mixture_cleaners(text):
+     from text.mandarin import chinese_to_romaji
+     from text.japanese import japanese_to_romaji_with_accent
      chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
      japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
      for chinese_text in chinese_texts:
@@ -38,25 +53,53 @@ def zh_ja_mixture_cleaners(text):
          text += '.'
      return text

+
+ def sanskrit_cleaners(text):
+     text = text.replace('॥', '।').replace('ॐ', 'ओम्')
+     if text[-1] != '।':
+         text += ' ।'
+     return text
+
+
+ def cjks_cleaners(text):
+     from text.mandarin import chinese_to_lazy_ipa
+     from text.japanese import japanese_to_ipa
+     from text.korean import korean_to_lazy_ipa
+     from text.sanskrit import devanagari_to_ipa
+     chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
+     japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
+     korean_texts = re.findall(r'\[KO\].*?\[KO\]', text)
+     sanskrit_texts = re.findall(r'\[SA\].*?\[SA\]', text)
+     for chinese_text in chinese_texts:
+         cleaned_text = chinese_to_lazy_ipa(chinese_text[4:-4])
+         text = text.replace(chinese_text, cleaned_text+' ', 1)
+     for japanese_text in japanese_texts:
+         cleaned_text = japanese_to_ipa(japanese_text[4:-4])
+         text = text.replace(japanese_text, cleaned_text+' ', 1)
+     for korean_text in korean_texts:
+         cleaned_text = korean_to_lazy_ipa(korean_text[4:-4])
+         text = text.replace(korean_text, cleaned_text+' ', 1)
+     for sanskrit_text in sanskrit_texts:
+         cleaned_text = devanagari_to_ipa(sanskrit_text[4:-4])
+         text = text.replace(sanskrit_text, cleaned_text+' ', 1)
+     text = text[:-1]
+     if re.match(r'[^\.,!\?\-…~]', text[-1]):
+         text += '.'
+     return text
+
  def cjke_cleaners(text):
      chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
      japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
-     english_texts = re.findall(r'\[EN\].*?\[EN\]', text)
      for chinese_text in chinese_texts:
          cleaned_text = chinese_to_lazy_ipa(chinese_text[4:-4])
          cleaned_text = cleaned_text.replace(
              'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')
          text = text.replace(chinese_text, cleaned_text+' ', 1)
      for japanese_text in japanese_texts:
-         cleaned_text = clean_japanese(japanese_text[4:-4])
+         cleaned_text = japanese_to_ipa(japanese_text[4:-4])
          cleaned_text = cleaned_text.replace('ʧ', 'tʃ').replace(
              'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')
          text = text.replace(japanese_text, cleaned_text+' ', 1)
-     for english_text in english_texts:
-         cleaned_text = english_to_ipa2(english_text[4:-4])
-         cleaned_text = cleaned_text.replace('ɑ', 'a').replace(
-             'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')
-         text = text.replace(english_text, cleaned_text+' ', 1)
      text = text[:-1]
      if re.match(r'[^\.,!\?\-…~]', text[-1]):
          text += '.'
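
All the mixture cleaners here follow the same convention: each language segment is wrapped in a pair of 4-character tags ([ZH]...[ZH], [JA]...[JA], [KO]...[KO], [SA]...[SA]) which the cleaner strips with segment[4:-4] before converting the segment to romaji or IPA. Below is a small sketch of driving the newly added cjks_cleaners; the sample sentence is illustrative, and the exact IPA it prints depends on the text.mandarin, text.japanese and text.korean helpers.

from text import cleaners

# Tagged multilingual input: Chinese, Japanese and Korean segments.
mixed = '[ZH]你好[ZH][JA]こんにちは[JA][KO]안녕하세요[KO]'

# Each tagged span is replaced by its (lazy) IPA transcription plus a space;
# the trailing space is trimmed and a '.' appended if no punctuation ends it.
print(cleaners.cjks_cleaners(mixed))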
text/japanese.py CHANGED
@@ -1,18 +1,6 @@
  import re
  from unidecode import unidecode
- from unidecode import unidecode
- import ctypes
-
- dll = ctypes.cdll.LoadLibrary('cleaners/JapaneseCleaner.dll')
- dll.CreateOjt.restype = ctypes.c_uint64
- dll.PluginMain.restype = ctypes.c_uint64
- floder = ctypes.create_unicode_buffer("cleaners")
- dll.CreateOjt(floder)
-
- def clean_japanese(text):
-     input_wchar_pointer = ctypes.create_unicode_buffer(text)
-     result = ctypes.wstring_at(dll.PluginMain(input_wchar_pointer))
-     return result
+ import pyopenjtalk


  # Regular expression matching Japanese without punctuation marks:
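
The Windows-only ctypes binding to cleaners/JapaneseCleaner.dll (and its clean_japanese wrapper) is replaced by an import of pyopenjtalk, the Open JTalk based g2p package that the romaji/IPA helpers further down this file presumably build on. A quick way to confirm the new dependency is usable; the printed phoneme string is indicative only.

import pyopenjtalk

# Grapheme-to-phoneme conversion via Open JTalk, covering the role the
# bundled DLL used to play for Japanese text normalization.
print(pyopenjtalk.g2p('こんにちは'))  # e.g. 'k o N n i ch i w a'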