Mahiruoshi committed on
Commit
aa72a6f
·
1 Parent(s): 01b6d61

Upload 26 files

Browse files
text/__pycache__/__init__.cpython-38.pyc CHANGED
Binary files a/text/__pycache__/__init__.cpython-38.pyc and b/text/__pycache__/__init__.cpython-38.pyc differ
 
text/__pycache__/cleaners.cpython-38.pyc CHANGED
Binary files a/text/__pycache__/cleaners.cpython-38.pyc and b/text/__pycache__/cleaners.cpython-38.pyc differ
 
text/__pycache__/japanese.cpython-38.pyc CHANGED
Binary files a/text/__pycache__/japanese.cpython-38.pyc and b/text/__pycache__/japanese.cpython-38.pyc differ
 
text/__pycache__/korean.cpython-38.pyc CHANGED
Binary files a/text/__pycache__/korean.cpython-38.pyc and b/text/__pycache__/korean.cpython-38.pyc differ
 
text/__pycache__/mandarin.cpython-38.pyc CHANGED
Binary files a/text/__pycache__/mandarin.cpython-38.pyc and b/text/__pycache__/mandarin.cpython-38.pyc differ
 
text/__pycache__/sanskrit.cpython-38.pyc CHANGED
Binary files a/text/__pycache__/sanskrit.cpython-38.pyc and b/text/__pycache__/sanskrit.cpython-38.pyc differ
 
text/__pycache__/shanghainese.cpython-38.pyc CHANGED
Binary files a/text/__pycache__/shanghainese.cpython-38.pyc and b/text/__pycache__/shanghainese.cpython-38.pyc differ
 
text/__pycache__/symbols.cpython-38.pyc CHANGED
Binary files a/text/__pycache__/symbols.cpython-38.pyc and b/text/__pycache__/symbols.cpython-38.pyc differ
 
text/__pycache__/thai.cpython-38.pyc CHANGED
Binary files a/text/__pycache__/thai.cpython-38.pyc and b/text/__pycache__/thai.cpython-38.pyc differ
 
text/cleaners.py CHANGED
@@ -1,11 +1,19 @@
1
  import re
 
2
  from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3
3
  from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
4
- from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
 
 
 
 
 
 
5
 
6
  def japanese_cleaners(text):
7
  text = japanese_to_romaji_with_accent(text)
8
- text = re.sub(r'([A-Za-z])$', r'\1.', text)
 
9
  return text
10
 
11
 
@@ -18,7 +26,8 @@ def korean_cleaners(text):
18
  text = latin_to_hangul(text)
19
  text = number_to_hangul(text)
20
  text = divide_hangul(text)
21
- text = re.sub(r'([\u3131-\u3163])$', r'\1.', text)
 
22
  return text
23
 
24
 
@@ -27,39 +36,58 @@ def chinese_cleaners(text):
27
  text = number_to_chinese(text)
28
  text = chinese_to_bopomofo(text)
29
  text = latin_to_bopomofo(text)
30
- text = re.sub(r'([ˉˊˇˋ˙])$', r'\1。', text)
 
31
  return text
32
 
33
 
34
  def zh_ja_mixture_cleaners(text):
35
- text = re.sub(r'\[ZH\](.*?)\[ZH\]',
36
- lambda x: chinese_to_romaji(x.group(1))+' ', text)
37
- text = re.sub(r'\[JA\](.*?)\[JA\]', lambda x: japanese_to_romaji_with_accent(
38
- x.group(1)).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')+' ', text)
39
- text = re.sub(r'\s+$', '', text)
40
- text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
 
 
 
 
 
 
41
  return text
42
 
43
 
44
  def sanskrit_cleaners(text):
45
  text = text.replace('॥', '।').replace('ॐ', 'ओम्')
46
- text = re.sub(r'([^।])$', r'\1।', text)
 
47
  return text
48
 
49
 
50
  def cjks_cleaners(text):
51
- text = re.sub(r'\[ZH\](.*?)\[ZH\]',
52
- lambda x: chinese_to_lazy_ipa(x.group(1))+' ', text)
53
- text = re.sub(r'\[JA\](.*?)\[JA\]',
54
- lambda x: japanese_to_ipa(x.group(1))+' ', text)
55
- text = re.sub(r'\[KO\](.*?)\[KO\]',
56
- lambda x: korean_to_lazy_ipa(x.group(1))+' ', text)
57
- text = re.sub(r'\[SA\](.*?)\[SA\]',
58
- lambda x: devanagari_to_ipa(x.group(1))+' ', text)
59
- text = re.sub(r'\[EN\](.*?)\[EN\]',
60
- lambda x: english_to_lazy_ipa(x.group(1))+' ', text)
61
- text = re.sub(r'\s+$', '', text)
62
- text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
 
 
 
 
 
 
 
 
 
 
 
63
  return text
64
 
65
 
@@ -91,3 +119,58 @@ def cjke_cleaners(text):
91
  text += '.'
92
  return text
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import re
2
+ from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
3
  from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3
4
  from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
5
+ # from text.sanskrit import devanagari_to_ipa
6
+ # from text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
7
+ # from text.thai import num_to_thai, latin_to_thai
8
+ # from text.shanghainese import shanghainese_to_ipa
9
+ # from text.cantonese import cantonese_to_ipa
10
+ # from text.ngu_dialect import ngu_dialect_to_ipa
11
+
12
 
13
  def japanese_cleaners(text):
14
  text = japanese_to_romaji_with_accent(text)
15
+ if re.match('[A-Za-z]', text[-1]):
16
+ text += '.'
17
  return text
18
 
19
 
 
26
  text = latin_to_hangul(text)
27
  text = number_to_hangul(text)
28
  text = divide_hangul(text)
29
+ if re.match('[\u3131-\u3163]', text[-1]):
30
+ text += '.'
31
  return text
32
 
33
 
 
36
  text = number_to_chinese(text)
37
  text = chinese_to_bopomofo(text)
38
  text = latin_to_bopomofo(text)
39
+ if re.match('[ˉˊˇˋ˙]', text[-1]):
40
+ text += '。'
41
  return text
42
 
43
 
44
  def zh_ja_mixture_cleaners(text):
45
+ chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
46
+ japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
47
+ for chinese_text in chinese_texts:
48
+ cleaned_text = chinese_to_romaji(chinese_text[4:-4])
49
+ text = text.replace(chinese_text, cleaned_text+' ', 1)
50
+ for japanese_text in japanese_texts:
51
+ cleaned_text = japanese_to_romaji_with_accent(
52
+ japanese_text[4:-4]).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')
53
+ text = text.replace(japanese_text, cleaned_text+' ', 1)
54
+ text = text[:-1]
55
+ if re.match('[A-Za-zɯɹəɥ→↓↑]', text[-1]):
56
+ text += '.'
57
  return text
58
 
59
 
60
  def sanskrit_cleaners(text):
61
  text = text.replace('॥', '।').replace('ॐ', 'ओम्')
62
+ if text[-1] != '।':
63
+ text += ' ।'
64
  return text
65
 
66
 
67
  def cjks_cleaners(text):
68
+ chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
69
+ japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
70
+ korean_texts = re.findall(r'\[KO\].*?\[KO\]', text)
71
+ sanskrit_texts = re.findall(r'\[SA\].*?\[SA\]', text)
72
+ english_texts = re.findall(r'\[EN\].*?\[EN\]', text)
73
+ for chinese_text in chinese_texts:
74
+ cleaned_text = chinese_to_lazy_ipa(chinese_text[4:-4])
75
+ text = text.replace(chinese_text, cleaned_text+' ', 1)
76
+ for japanese_text in japanese_texts:
77
+ cleaned_text = japanese_to_ipa(japanese_text[4:-4])
78
+ text = text.replace(japanese_text, cleaned_text+' ', 1)
79
+ for korean_text in korean_texts:
80
+ cleaned_text = korean_to_lazy_ipa(korean_text[4:-4])
81
+ text = text.replace(korean_text, cleaned_text+' ', 1)
82
+ for sanskrit_text in sanskrit_texts:
83
+ cleaned_text = devanagari_to_ipa(sanskrit_text[4:-4])
84
+ text = text.replace(sanskrit_text, cleaned_text+' ', 1)
85
+ for english_text in english_texts:
86
+ cleaned_text = english_to_lazy_ipa(english_text[4:-4])
87
+ text = text.replace(english_text, cleaned_text+' ', 1)
88
+ text = text[:-1]
89
+ if re.match(r'[^\.,!\?\-…~]', text[-1]):
90
+ text += '.'
91
  return text
92
 
93
 
 
119
  text += '.'
120
  return text
121
 
122
+
123
def cjke_cleaners2(text):
    """Convert ``[ZH]``, ``[JA]``, ``[KO]`` and ``[EN]`` tagged spans.

    Each ``[XX]...[XX]`` span is replaced by the output of its converter
    (``chinese_to_ipa``, ``japanese_to_ipa2``, ``korean_to_ipa``,
    ``english_to_ipa2``) followed by one space.

    Args:
        text: Input string, optionally containing tagged spans.

    Returns:
        The converted string with trailing whitespace removed. A '.' is
        appended unless the text is empty or already ends with one of
        ``. , ! ? - … ~``.
    """
    # NOTE(review): no import of korean_to_ipa is visible in this part of the
    # file -- confirm it is in scope before using [KO] spans.
    text = re.sub(r'\[ZH\](.*?)\[ZH\]',
                  lambda match: chinese_to_ipa(match.group(1)) + ' ',
                  text)
    text = re.sub(r'\[JA\](.*?)\[JA\]',
                  lambda match: japanese_to_ipa2(match.group(1)) + ' ',
                  text)
    text = re.sub(r'\[KO\](.*?)\[KO\]',
                  lambda match: korean_to_ipa(match.group(1)) + ' ',
                  text)
    text = re.sub(r'\[EN\](.*?)\[EN\]',
                  lambda match: english_to_ipa2(match.group(1)) + ' ',
                  text)
    # Strip trailing whitespace only. Unconditionally dropping the last
    # character deleted real content when the text did not end in a
    # converted span.
    text = re.sub(r'\s+$', '', text)
    # `text and` avoids IndexError on an empty result.
    if text and re.match(r'[^\.,!\?\-…~]', text[-1]):
        text += '.'
    return text
144
+
145
+
146
def thai_cleaners(text):
    """Apply ``num_to_thai`` and then ``latin_to_thai`` to ``text``.

    Args:
        text: Input string.

    Returns:
        The result of ``latin_to_thai(num_to_thai(text))``.
    """
    # NOTE(review): the ``from text.thai import ...`` line appears commented
    # out in this file -- confirm both helpers are in scope.
    return latin_to_thai(num_to_thai(text))
150
+
151
+
152
def shanghainese_cleaners(text):
    """Run text through ``shanghainese_to_ipa`` and terminate it.

    Args:
        text: Input string.

    Returns:
        The converted string. A '.' is appended unless the result is empty
        or already ends with one of ``. , ! ? - … ~``.
    """
    # NOTE(review): the ``from text.shanghainese import ...`` line appears
    # commented out in this file -- confirm the helper is in scope.
    text = shanghainese_to_ipa(text)
    # `text and` avoids IndexError when the conversion result is empty.
    if text and re.match(r'[^\.,!\?\-…~]', text[-1]):
        text += '.'
    return text
157
+
158
+
159
def chinese_dialect_cleaners(text):
    """Convert two-letter tagged spans to IPA, one rule per tag, in order.

    ``[MD]``, ``[TW]``, ``[JA]``, ``[SH]``, ``[GD]`` and ``[EN]`` spans are
    handled by their dedicated converters. Any remaining ``[XX]...[XX]`` span
    (two uppercase letters) is passed to ``ngu_dialect_to_ipa`` together with
    its tag. Every converted span is followed by one space.

    Args:
        text: Input string, optionally containing tagged spans.

    Returns:
        The converted string with trailing whitespace removed and a '.'
        appended when it does not end with one of ``. , ! ? - … ~``.
    """
    # NOTE(review): the imports for the Shanghainese, Cantonese and Ngu
    # dialect helpers appear commented out in this file -- confirm they are
    # in scope.
    def _mandarin(match):
        return chinese_to_ipa2(match.group(1)) + ' '

    def _taiwanese(match):
        return chinese_to_ipa2(match.group(1), True) + ' '

    def _japanese(match):
        return japanese_to_ipa3(match.group(1)).replace('Q', 'ʔ') + ' '

    def _shanghainese(match):
        ipa = shanghainese_to_ipa(match.group(1))
        ipa = ipa.replace('1', '˥˧').replace('5', '˧˧˦').replace('6', '˩˩˧')
        ipa = ipa.replace('7', '˥').replace('8', '˩˨')
        ipa = ipa.replace('ᴀ', 'ɐ').replace('ᴇ', 'e')
        return ipa + ' '

    def _cantonese(match):
        return cantonese_to_ipa(match.group(1)) + ' '

    def _english(match):
        return english_to_lazy_ipa2(match.group(1)) + ' '

    def _other_dialect(match):
        ipa = ngu_dialect_to_ipa(match.group(2), match.group(1))
        ipa = ipa.replace('ʣ', 'dz').replace('ʥ', 'dʑ')
        ipa = ipa.replace('ʦ', 'ts').replace('ʨ', 'tɕ')
        return ipa + ' '

    # Order matters: the generic two-letter rule must run last so it only
    # sees tags the dedicated rules did not consume.
    rules = (
        (r'\[MD\](.*?)\[MD\]', _mandarin),
        (r'\[TW\](.*?)\[TW\]', _taiwanese),
        (r'\[JA\](.*?)\[JA\]', _japanese),
        (r'\[SH\](.*?)\[SH\]', _shanghainese),
        (r'\[GD\](.*?)\[GD\]', _cantonese),
        (r'\[EN\](.*?)\[EN\]', _english),
        (r'\[([A-Z]{2})\](.*?)\[\1\]', _other_dialect),
    )
    for pattern, handler in rules:
        text = re.sub(pattern, handler, text)

    text = re.sub(r'\s+$', '', text)
    return re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
text/mandarin.py CHANGED
@@ -4,7 +4,6 @@ import re
4
  from pypinyin import lazy_pinyin, BOPOMOFO
5
  import jieba
6
  import cn2an
7
- import logging
8
 
9
 
10
  # List of (Latin alphabet, bopomofo) pairs:
@@ -240,7 +239,7 @@ def number_to_chinese(text):
240
  return text
241
 
242
 
243
- def chinese_to_bopomofo(text):
244
  text = text.replace('、', ',').replace(';', ',').replace(':', ',')
245
  words = jieba.lcut(text, cut_all=False)
246
  text = ''
@@ -253,7 +252,10 @@ def chinese_to_bopomofo(text):
253
  bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i])
254
  if text != '':
255
  text += ' '
256
- text += ''.join(bopomofos)
 
 
 
257
  return text
258
 
259
 
@@ -314,9 +316,9 @@ def chinese_to_ipa(text):
314
  return text
315
 
316
 
317
- def chinese_to_ipa2(text):
318
  text = number_to_chinese(text)
319
- text = chinese_to_bopomofo(text)
320
  text = latin_to_bopomofo(text)
321
  text = bopomofo_to_ipa2(text)
322
  text = re.sub(r'i([aoe])', r'j\1', text)
 
4
  from pypinyin import lazy_pinyin, BOPOMOFO
5
  import jieba
6
  import cn2an
 
7
 
8
 
9
  # List of (Latin alphabet, bopomofo) pairs:
 
239
  return text
240
 
241
 
242
+ def chinese_to_bopomofo(text, taiwanese=False):
243
  text = text.replace('、', ',').replace(';', ',').replace(':', ',')
244
  words = jieba.lcut(text, cut_all=False)
245
  text = ''
 
252
  bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i])
253
  if text != '':
254
  text += ' '
255
+ if taiwanese:
256
+ text += '#'+'#'.join(bopomofos)
257
+ else:
258
+ text += ''.join(bopomofos)
259
  return text
260
 
261
 
 
316
  return text
317
 
318
 
319
+ def chinese_to_ipa2(text, taiwanese=False):
320
  text = number_to_chinese(text)
321
+ text = chinese_to_bopomofo(text, taiwanese)
322
  text = latin_to_bopomofo(text)
323
  text = bopomofo_to_ipa2(text)
324
  text = re.sub(r'i([aoe])', r'j\1', text)
text/symbols.py CHANGED
@@ -1,18 +1,15 @@
1
  '''
2
  Defines the set of symbols used in text input to the model.
3
  '''
4
-
5
- '''# japanese_cleaners
6
  _pad = '_'
7
- _punctuation = ',.!?-'
8
- _letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧ↓↑ '
9
  '''
10
-
11
  # japanese_cleaners2
12
  _pad = '_'
13
  _punctuation = ',.!?-~…'
14
  _letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧʦ↓↑ '
15
-
16
 
17
  '''# korean_cleaners
18
  _pad = '_'
@@ -26,11 +23,6 @@ _punctuation = ',。!?—…'
26
  _letters = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩˉˊˇˋ˙ '
27
  '''
28
 
29
- '''# zh_ja_mixture_cleaners
30
- _pad = '_'
31
- _punctuation = ',.!?-~…'
32
- _letters = 'AEINOQUabdefghijklmnoprstuvwyzʃʧʦɯɹəɥ⁼ʰ`→↓↑ '
33
- '''
34
 
35
  '''# sanskrit_cleaners
36
  _pad = '_'
@@ -65,7 +57,7 @@ _letters = 'abdfghiklmnopstuvyzøŋȵɑɔɕəɤɦɪɿʑʔʰ̩̃ᴀᴇ15678 '
65
  '''# chinese_dialect_cleaners
66
  _pad = '_'
67
  _punctuation = ',.!?~…─'
68
- _letters = '#Nabdefghijklmnoprstuvwxyzæçøŋœȵɐɑɒɓɔɕɗɘəɚɛɜɣɤɦɪɭɯɵɷɸɻɾɿʂʅʊʋʌʏʑʔʦʮʰʷˀː˥˦˧˨˩̥̩̃̚ᴀᴇ↑↓∅ⱼ '
69
  '''
70
 
71
  # Export all symbols:
 
1
  '''
2
  Defines the set of symbols used in text input to the model.
3
  '''
 
 
4
  _pad = '_'
5
+ _punctuation = ',.!?-~…'
6
+ _letters = 'AEINOQUabdefghijklmnoprstuvwyzʃʧʦɯɹəɥ⁼ʰ`→↓↑ '
7
  '''
 
8
  # japanese_cleaners2
9
  _pad = '_'
10
  _punctuation = ',.!?-~…'
11
  _letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧʦ↓↑ '
12
+ '''
13
 
14
  '''# korean_cleaners
15
  _pad = '_'
 
23
  _letters = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩˉˊˇˋ˙ '
24
  '''
25
 
 
 
 
 
 
26
 
27
  '''# sanskrit_cleaners
28
  _pad = '_'
 
57
  '''# chinese_dialect_cleaners
58
  _pad = '_'
59
  _punctuation = ',.!?~…─'
60
+ _letters = '#Nabdefghijklmnoprstuvwxyzæçøŋœȵɐɑɒɓɔɕɗɘəɚɛɜɣɤɦɪɭɯɵɷɸɻɾɿʂʅʊʋʌʏʑʔʦʮʰʷˀː˥˦˧˨˩̥̩̃̚αᴀᴇ↑↓∅ⱼ '
61
  '''
62
 
63
  # Export all symbols: