Mahiruoshi
committed on
Commit
•
361bf5f
1
Parent(s):
d67d0bf
Upload 11 files
Browse files
- text/__init__.py +3 -27
- text/__pycache__/__init__.cpython-39.pyc +0 -0
- text/__pycache__/cleaners.cpython-39.pyc +0 -0
- text/__pycache__/japanese.cpython-39.pyc +0 -0
- text/__pycache__/mandarin.cpython-39.pyc +0 -0
- text/cleaners.py +57 -14
- text/japanese.py +1 -13
text/__init__.py
CHANGED
@@ -1,14 +1,8 @@
|
|
1 |
""" from https://github.com/keithito/tacotron """
|
2 |
from text import cleaners
|
3 |
-
from text.symbols import symbols
|
4 |
|
5 |
|
6 |
-
|
7 |
-
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
|
8 |
-
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
|
9 |
-
|
10 |
-
|
11 |
-
def text_to_sequence(text, cleaner_names):
|
12 |
'''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
|
13 |
Args:
|
14 |
text: string to convert to a sequence
|
@@ -16,6 +10,8 @@ def text_to_sequence(text, cleaner_names):
|
|
16 |
Returns:
|
17 |
List of integers corresponding to the symbols in the text
|
18 |
'''
|
|
|
|
|
19 |
sequence = []
|
20 |
|
21 |
clean_text = _clean_text(text, cleaner_names)
|
@@ -27,26 +23,6 @@ def text_to_sequence(text, cleaner_names):
|
|
27 |
return sequence
|
28 |
|
29 |
|
30 |
-
def cleaned_text_to_sequence(cleaned_text):
|
31 |
-
'''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
|
32 |
-
Args:
|
33 |
-
text: string to convert to a sequence
|
34 |
-
Returns:
|
35 |
-
List of integers corresponding to the symbols in the text
|
36 |
-
'''
|
37 |
-
sequence = [_symbol_to_id[symbol] for symbol in cleaned_text if symbol in _symbol_to_id.keys()]
|
38 |
-
return sequence
|
39 |
-
|
40 |
-
|
41 |
-
def sequence_to_text(sequence):
|
42 |
-
'''Converts a sequence of IDs back to a string'''
|
43 |
-
result = ''
|
44 |
-
for symbol_id in sequence:
|
45 |
-
s = _id_to_symbol[symbol_id]
|
46 |
-
result += s
|
47 |
-
return result
|
48 |
-
|
49 |
-
|
50 |
def _clean_text(text, cleaner_names):
|
51 |
for name in cleaner_names:
|
52 |
cleaner = getattr(cleaners, name)
|
|
|
1 |
""" from https://github.com/keithito/tacotron """
|
2 |
from text import cleaners
|
|
|
3 |
|
4 |
|
5 |
+
def text_to_sequence(text, symbols, cleaner_names):
|
|
|
|
|
|
|
|
|
|
|
6 |
'''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
|
7 |
Args:
|
8 |
text: string to convert to a sequence
|
|
|
10 |
Returns:
|
11 |
List of integers corresponding to the symbols in the text
|
12 |
'''
|
13 |
+
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
|
14 |
+
|
15 |
sequence = []
|
16 |
|
17 |
clean_text = _clean_text(text, cleaner_names)
|
|
|
23 |
return sequence
|
24 |
|
25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
def _clean_text(text, cleaner_names):
|
27 |
for name in cleaner_names:
|
28 |
cleaner = getattr(cleaners, name)
|
text/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (1.23 kB). View file
|
|
text/__pycache__/cleaners.cpython-39.pyc
ADDED
Binary file (3.74 kB). View file
|
|
text/__pycache__/japanese.cpython-39.pyc
ADDED
Binary file (4.43 kB). View file
|
|
text/__pycache__/mandarin.cpython-39.pyc
ADDED
Binary file (6.4 kB). View file
|
|
text/cleaners.py
CHANGED
@@ -1,21 +1,33 @@
|
|
1 |
import re
|
2 |
-
from text.
|
3 |
-
from text.japanese import clean_japanese, japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3
|
4 |
from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
|
5 |
|
6 |
-
def none_cleaner(text):
|
7 |
-
return text
|
8 |
-
|
9 |
def japanese_cleaners(text):
|
10 |
-
text
|
11 |
-
text =
|
|
|
|
|
12 |
return text
|
13 |
|
|
|
14 |
def japanese_cleaners2(text):
|
15 |
return japanese_cleaners(text).replace('ts', 'ʦ').replace('...', '…')
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
def chinese_cleaners(text):
|
18 |
'''Pipeline for Chinese text'''
|
|
|
19 |
text = number_to_chinese(text)
|
20 |
text = chinese_to_bopomofo(text)
|
21 |
text = latin_to_bopomofo(text)
|
@@ -23,7 +35,10 @@ def chinese_cleaners(text):
|
|
23 |
text += '。'
|
24 |
return text
|
25 |
|
|
|
26 |
def zh_ja_mixture_cleaners(text):
|
|
|
|
|
27 |
chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
|
28 |
japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
|
29 |
for chinese_text in chinese_texts:
|
@@ -38,25 +53,53 @@ def zh_ja_mixture_cleaners(text):
|
|
38 |
text += '.'
|
39 |
return text
|
40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
def cjke_cleaners(text):
|
42 |
chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
|
43 |
japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
|
44 |
-
english_texts = re.findall(r'\[EN\].*?\[EN\]', text)
|
45 |
for chinese_text in chinese_texts:
|
46 |
cleaned_text = chinese_to_lazy_ipa(chinese_text[4:-4])
|
47 |
cleaned_text = cleaned_text.replace(
|
48 |
'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')
|
49 |
text = text.replace(chinese_text, cleaned_text+' ', 1)
|
50 |
for japanese_text in japanese_texts:
|
51 |
-
cleaned_text =
|
52 |
cleaned_text = cleaned_text.replace('ʧ', 'tʃ').replace(
|
53 |
'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')
|
54 |
text = text.replace(japanese_text, cleaned_text+' ', 1)
|
55 |
-
for english_text in english_texts:
|
56 |
-
cleaned_text = english_to_ipa2(english_text[4:-4])
|
57 |
-
cleaned_text = cleaned_text.replace('ɑ', 'a').replace(
|
58 |
-
'ɔ', 'o').replace('ɛ', 'e').replace('ɪ', 'i').replace('ʊ', 'u')
|
59 |
-
text = text.replace(english_text, cleaned_text+' ', 1)
|
60 |
text = text[:-1]
|
61 |
if re.match(r'[^\.,!\?\-…~]', text[-1]):
|
62 |
text += '.'
|
|
|
1 |
import re
|
2 |
+
from text.japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3
|
|
|
3 |
from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
|
4 |
|
|
|
|
|
|
|
5 |
def japanese_cleaners(text):
|
6 |
+
from text.japanese import japanese_to_romaji_with_accent
|
7 |
+
text = japanese_to_romaji_with_accent(text)
|
8 |
+
if re.match('[A-Za-z]', text[-1]):
|
9 |
+
text += '.'
|
10 |
return text
|
11 |
|
12 |
+
|
13 |
def japanese_cleaners2(text):
|
14 |
return japanese_cleaners(text).replace('ts', 'ʦ').replace('...', '…')
|
15 |
|
16 |
+
|
17 |
+
def korean_cleaners(text):
|
18 |
+
'''Pipeline for Korean text'''
|
19 |
+
from text.korean import latin_to_hangul, number_to_hangul, divide_hangul
|
20 |
+
text = latin_to_hangul(text)
|
21 |
+
text = number_to_hangul(text)
|
22 |
+
text = divide_hangul(text)
|
23 |
+
if re.match('[\u3131-\u3163]', text[-1]):
|
24 |
+
text += '.'
|
25 |
+
return text
|
26 |
+
|
27 |
+
|
28 |
def chinese_cleaners(text):
|
29 |
'''Pipeline for Chinese text'''
|
30 |
+
from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo
|
31 |
text = number_to_chinese(text)
|
32 |
text = chinese_to_bopomofo(text)
|
33 |
text = latin_to_bopomofo(text)
|
|
|
35 |
text += '。'
|
36 |
return text
|
37 |
|
38 |
+
|
39 |
def zh_ja_mixture_cleaners(text):
|
40 |
+
from text.mandarin import chinese_to_romaji
|
41 |
+
from text.japanese import japanese_to_romaji_with_accent
|
42 |
chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
|
43 |
japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
|
44 |
for chinese_text in chinese_texts:
|
|
|
53 |
text += '.'
|
54 |
return text
|
55 |
|
56 |
+
|
57 |
+
def sanskrit_cleaners(text):
|
58 |
+
text = text.replace('॥', '।').replace('ॐ', 'ओम्')
|
59 |
+
if text[-1] != '।':
|
60 |
+
text += ' ।'
|
61 |
+
return text
|
62 |
+
|
63 |
+
|
64 |
+
def cjks_cleaners(text):
|
65 |
+
from text.mandarin import chinese_to_lazy_ipa
|
66 |
+
from text.japanese import japanese_to_ipa
|
67 |
+
from text.korean import korean_to_lazy_ipa
|
68 |
+
from text.sanskrit import devanagari_to_ipa
|
69 |
+
chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
|
70 |
+
japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
|
71 |
+
korean_texts = re.findall(r'\[KO\].*?\[KO\]', text)
|
72 |
+
sanskrit_texts = re.findall(r'\[SA\].*?\[SA\]', text)
|
73 |
+
for chinese_text in chinese_texts:
|
74 |
+
cleaned_text = chinese_to_lazy_ipa(chinese_text[4:-4])
|
75 |
+
text = text.replace(chinese_text, cleaned_text+' ', 1)
|
76 |
+
for japanese_text in japanese_texts:
|
77 |
+
cleaned_text = japanese_to_ipa(japanese_text[4:-4])
|
78 |
+
text = text.replace(japanese_text, cleaned_text+' ', 1)
|
79 |
+
for korean_text in korean_texts:
|
80 |
+
cleaned_text = korean_to_lazy_ipa(korean_text[4:-4])
|
81 |
+
text = text.replace(korean_text, cleaned_text+' ', 1)
|
82 |
+
for sanskrit_text in sanskrit_texts:
|
83 |
+
cleaned_text = devanagari_to_ipa(sanskrit_text[4:-4])
|
84 |
+
text = text.replace(sanskrit_text, cleaned_text+' ', 1)
|
85 |
+
text = text[:-1]
|
86 |
+
if re.match(r'[^\.,!\?\-…~]', text[-1]):
|
87 |
+
text += '.'
|
88 |
+
return text
|
89 |
+
|
90 |
def cjke_cleaners(text):
|
91 |
chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
|
92 |
japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
|
|
|
93 |
for chinese_text in chinese_texts:
|
94 |
cleaned_text = chinese_to_lazy_ipa(chinese_text[4:-4])
|
95 |
cleaned_text = cleaned_text.replace(
|
96 |
'ʧ', 'tʃ').replace('ʦ', 'ts').replace('ɥan', 'ɥæn')
|
97 |
text = text.replace(chinese_text, cleaned_text+' ', 1)
|
98 |
for japanese_text in japanese_texts:
|
99 |
+
cleaned_text = japanese_to_ipa(japanese_text[4:-4])
|
100 |
cleaned_text = cleaned_text.replace('ʧ', 'tʃ').replace(
|
101 |
'ʦ', 'ts').replace('ɥan', 'ɥæn').replace('ʥ', 'dz')
|
102 |
text = text.replace(japanese_text, cleaned_text+' ', 1)
|
|
|
|
|
|
|
|
|
|
|
103 |
text = text[:-1]
|
104 |
if re.match(r'[^\.,!\?\-…~]', text[-1]):
|
105 |
text += '.'
|
text/japanese.py
CHANGED
@@ -1,18 +1,6 @@
|
|
1 |
import re
|
2 |
from unidecode import unidecode
|
3 |
-
|
4 |
-
import ctypes
|
5 |
-
|
6 |
-
dll = ctypes.cdll.LoadLibrary('cleaners/JapaneseCleaner.dll')
|
7 |
-
dll.CreateOjt.restype = ctypes.c_uint64
|
8 |
-
dll.PluginMain.restype = ctypes.c_uint64
|
9 |
-
floder = ctypes.create_unicode_buffer("cleaners")
|
10 |
-
dll.CreateOjt(floder)
|
11 |
-
|
12 |
-
def clean_japanese(text):
|
13 |
-
input_wchar_pointer = ctypes.create_unicode_buffer(text)
|
14 |
-
result = ctypes.wstring_at(dll.PluginMain(input_wchar_pointer))
|
15 |
-
return result
|
16 |
|
17 |
|
18 |
# Regular expression matching Japanese without punctuation marks:
|
|
|
1 |
import re
|
2 |
from unidecode import unidecode
|
3 |
+
import pyopenjtalk
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
|
5 |
|
6 |
# Regular expression matching Japanese without punctuation marks:
|