transform
Browse files- __pycache__/parse_accent.cpython-310.pyc +0 -0
- __pycache__/surface2katakana_with_acc.cpython-310.pyc +0 -0
- app.py +6 -3
- parse_accent.py +62 -0
- requirements.txt +2 -1
- surface2katakana_with_acc.py +355 -0
__pycache__/parse_accent.cpython-310.pyc
ADDED
Binary file (1.28 kB). View file
|
|
__pycache__/surface2katakana_with_acc.cpython-310.pyc
ADDED
Binary file (7.97 kB). View file
|
|
app.py
CHANGED
@@ -8,7 +8,8 @@ import librosa
|
|
8 |
import spaces
|
9 |
import torch
|
10 |
from transformers import pipeline, WhisperConfig
|
11 |
-
|
|
|
12 |
warnings.filterwarnings("ignore")
|
13 |
|
14 |
is_hf = os.getenv("SYSTEM") == "spaces"
|
@@ -29,7 +30,7 @@ pipe = pipeline(
|
|
29 |
def transcribe(audio: str) -> str:
|
30 |
result = pipe(audio, generate_kwargs=generate_kwargs)["text"]
|
31 |
print(result)
|
32 |
-
return result
|
33 |
|
34 |
|
35 |
initial_md = """
|
@@ -43,6 +44,8 @@ with gr.Blocks() as app:
|
|
43 |
audio = gr.Audio(type="filepath")
|
44 |
transcribe_btn = gr.Button("Transcribe")
|
45 |
output = gr.Textbox(label="Result")
|
46 |
-
|
|
|
|
|
47 |
|
48 |
app.launch(inbrowser=True)
|
|
|
8 |
import spaces
|
9 |
import torch
|
10 |
from transformers import pipeline, WhisperConfig
|
11 |
+
from parse_accent import parse_pitch_accent
|
12 |
+
from surface2katakana_with_acc import katakana_to_phones
|
13 |
warnings.filterwarnings("ignore")
|
14 |
|
15 |
is_hf = os.getenv("SYSTEM") == "spaces"
|
|
|
30 |
def transcribe(audio: str) -> tuple[str, str, str]:
    """Transcribe an audio file and derive two extra renderings of the text.

    Args:
        audio: Path of the audio file handed over by the Gradio audio widget.

    Returns:
        A 3-tuple matching the three output text boxes: the raw pipeline
        text, the result of ``parse_pitch_accent`` on it, and the result of
        ``katakana_to_phones`` on it.
    """
    result = pipe(audio, generate_kwargs=generate_kwargs)["text"]
    print(result)
    return result, parse_pitch_accent(result), katakana_to_phones(result)
|
34 |
|
35 |
|
36 |
initial_md = """
|
|
|
44 |
audio = gr.Audio(type="filepath")
|
45 |
transcribe_btn = gr.Button("Transcribe")
|
46 |
output = gr.Textbox(label="Result")
|
47 |
+
output_HL_style = gr.Textbox(label="HL Result (SBV2 style)")
|
48 |
+
output_UPDOWN_style = gr.Textbox(label="↑↓ Result (GSV style)")
|
49 |
+
transcribe_btn.click(fn=transcribe,inputs=[audio], outputs=[output, output_HL_style, output_UPDOWN_style])
|
50 |
|
51 |
app.launch(inbrowser=True)
|
parse_accent.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def _initial_pitch_mark(s, start):
    """Return the mark for characters that precede the first accent marker.

    Scans ``s`` from ``start``: if the first marker found is '↑' the text
    before it is low ('0'); if it is '↓' the text before it is high ('1').
    When no marker exists at all, '0' is used as a neutral default.
    """
    for ch in s[start:]:
        if ch == '↑':
            return '0'
        if ch == '↓':
            return '1'
    return '0'


def parse_pitch_accent(s):
    """Convert an accent-annotated string into a string of '0'/'1' marks.

    '^', '#' and '$' are removed first. Every remaining character that is not
    an accent marker ('↑' / '↓') – including '_' and '?' – produces exactly
    one mark, so the result has the same length as the input with all of
    '^', '#', '$', '↑', '↓' stripped.

    Rules:
      * after '↑' characters are marked '1', after '↓' they are marked '0';
      * characters before the first marker get the opposite of what that
        marker switches to (looked up ahead), or '0' if there is no marker;
      * when the same marker occurs twice in a row (with text in between),
        the character right before the second one is forced to '0' for '↑'
        and to '1' for '↓'.

    Args:
        s: Annotated string, e.g. '^ト↓シコニ#ワ↑タシワ$'.

    Returns:
        The mark string, e.g. '10000111' for the example above.
    """
    # Remove '^', '#', and '$', keep '_', '?'
    s = s.replace('^', '').replace('#', '').replace('$', '')

    marks = []           # one '0'/'1' entry per non-marker character
    current_mark = None  # mark applied to upcoming characters
    last_accent = None   # most recent marker seen ('↑' or '↓')

    for i, char in enumerate(s):
        if char in ('↑', '↓'):
            if last_accent == char:
                # Same marker repeated: retroactively fix the previous mark.
                if marks:
                    marks[-1] = '0' if char == '↑' else '1'
            else:
                current_mark = '1' if char == '↑' else '0'
                last_accent = char
            continue

        if current_mark is None:
            # No marker seen yet: decide by look-ahead. Previously this could
            # leave None in the list and make the final join raise TypeError.
            current_mark = _initial_pitch_mark(s, i)
        marks.append(current_mark)

    return ''.join(marks)
|
53 |
+
def katakana_normalize(s):
    """Return ``s`` with the markers '^', '#', '↑', '↓' and '$' removed."""
    removal_table = {ord(marker): None for marker in "^#↑↓$"}
    return s.translate(removal_table)
|
55 |
+
# Example usage
|
56 |
+
# input_str = '^ト↓シコニ#ワ↑タシワ_ホ↓ボ#マ↓イニチ_オ↑ニ↓イソンニ#ナ↑クダシオ#サ↑レテマスシ$'
|
57 |
+
# output = parse_pitch_accent(input_str)
|
58 |
+
# output_str = katakana_normalize(input_str)
|
59 |
+
# print(output_str)
|
60 |
+
# assert len(output) == len(output_str)
|
61 |
+
# for i in range(len(output)):
|
62 |
+
# print(f"{output_str[i]}: {output[i]}")
|
requirements.txt
CHANGED
@@ -3,4 +3,5 @@ librosa
|
|
3 |
numpy
|
4 |
spaces
|
5 |
torch
|
6 |
-
transformers
|
|
|
|
3 |
numpy
|
4 |
spaces
|
5 |
torch
|
6 |
+
transformers
|
7 |
+
pyopenjtalk
|
surface2katakana_with_acc.py
ADDED
@@ -0,0 +1,355 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pyopenjtalk
|
2 |
+
import re
|
3 |
+
import sys
|
4 |
+
import os
|
5 |
+
|
6 |
+
# Temporarily redirect stdout and stderr to the null device. The handles are
# opened in a ``with`` block so they are closed again instead of leaking.
with open(os.devnull, 'w') as _null_out, open(os.devnull, 'w') as _null_err:
    sys.stdout = _null_out
    sys.stderr = _null_err
    try:
        # Call the function that produces the warning here,
        # e.g., pyopenjtalk.some_function()
        pass
    finally:
        # Restore stdout and stderr
        sys.stdout = sys.__stdout__
        sys.stderr = sys.__stderr__
|
16 |
+
|
17 |
+
# Hiragana -> katakana translation table (code point -> code point).
_HIRAGANA_CHARS = (
    "ぁあぃいぅうぇえぉおかがきぎくぐけげこご"
    "さざしじすずせぜそぞただちぢっつづてでとど"
    "なにぬねのはばぱひびぴふぶぷへべぺほぼぽ"
    "まみむめもゃやゅゆょよらりるれろゎわゐゑをんゔゕゖ"
)
_KATAKANA_CHARS = (
    "ァアィイゥウェエォオカガキギクグケゲコゴ"
    "サザシジスズセゼソゾタダチヂッツヅテデトド"
    "ナニヌネノハバパヒビピフブプヘベペホボポ"
    "マミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶ"
)
hiragana_to_katakana = {
    ord(hira): ord(kata)
    for hira, kata in zip(_HIRAGANA_CHARS, _KATAKANA_CHARS)
}


def hiragana_to_katakana_func(text):
    """Return ``text`` with every hiragana character replaced by katakana.

    Characters that are not in the table are left unchanged.
    """
    converted = text.translate(hiragana_to_katakana)
    return converted
|
32 |
+
|
33 |
+
def split_into_moras(kana):
    """Split a kana string into a list of mora-sized chunks.

    A chunk is one kana (katakana ァ–ヴ, hiragana ぁ–ゖ, or the long-vowel
    mark ー) optionally followed by one small katakana out of
    ァィゥェォャュョ. Characters that match none of these are dropped.
    """
    return re.findall(r"(?:[ァ-ヴー]|[ぁ-ゖ]|ー)[ァィゥェォャュョ]?|ー", kana)
|
41 |
+
|
42 |
+
def annotate_kana_with_accent(moras, acc):
    """Join ``moras`` into one string, inserting '↑' / '↓' according to ``acc``.

    * ``acc == 0`` or ``acc > 1``: '↑' is inserted after the first mora.
    * ``acc > 0``: '↓' is inserted after the ``acc``-th mora (for
      ``acc == 1`` that is directly after the first mora, with no '↑').

    NOTE(review): ``acc`` is presumably the accent-nucleus position reported
    by pyopenjtalk – confirm against the caller.
    """
    pieces = []
    for position, mora in enumerate(moras, start=1):
        pieces.append(mora)
        if position == 1 and (acc == 0 or acc > 1):
            # Rise after the first mora
            pieces.append('↑')
        elif acc > 0 and position == acc:
            # Fall after the acc-th mora
            pieces.append('↓')
    return ''.join(pieces)
|
57 |
+
|
58 |
+
def get_katakana_with_accent(text):
    """Return a katakana rendering of ``text`` with '↑' / '↓' accent marks.

    The text is analysed with ``pyopenjtalk.run_frontend``; each token is
    handled by its ``mora_size``:

    * more than one mora: the token's ``pron`` is passed through
      ``pyopenjtalk.g2p(..., kana=True)``, converted to katakana, split into
      moras and annotated with the token's ``acc`` value;
    * zero moras, or a ``pron`` equal to '’' (punctuation and the like): the
      original ``string`` of the token is kept;
    * otherwise the token's ``pron`` is used as is.

    Finally every '’' in the assembled text is replaced by '↑'. The original
    code called ``str.replace`` without using its return value, so this
    replacement never took effect.
    """
    pieces = []
    for token in pyopenjtalk.run_frontend(text):
        mora_size = token['mora_size']
        if mora_size > 1:
            # Kana reading of the pronunciation, normalised to katakana
            kana = pyopenjtalk.g2p(token['pron'], kana=True)
            kana = hiragana_to_katakana_func(kana)
            # Split into moras and add the accent symbols
            moras = split_into_moras(kana)
            pieces.append(annotate_kana_with_accent(moras, token['acc']))
        elif mora_size == 0 or token['pron'] == '’':
            # Punctuation etc.: keep the surface string unchanged
            pieces.append(token['string'])
        else:
            pieces.append(token['pron'])
    return ''.join(pieces).replace('’', '↑')
|
87 |
+
|
88 |
+
import pyopenjtalk
|
89 |
+
import re
|
90 |
+
def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
    """Build a phoneme + prosody symbol sequence for ``text``.

    Full-context labels are produced with ``pyopenjtalk`` and scanned one by
    one. The algorithm is based on `Prosodic features control by symbols as
    input of sequence-to-sequence acoustic modeling for neural TTS`_ with
    some r9y9's tweaks.

    Emitted symbols besides the phonemes themselves:
    '^' (leading sil), '$' / '?' (trailing sil, by the label's ``!N_`` field),
    '_' (pau), '#' (accent phrase border), ']' (pitch fall), '[' (pitch rise).

    Args:
        text (str): Input text.
        drop_unvoiced_vowels (bool): when true, the upper-case vowel phonemes
            A/E/I/O/U are lower-cased, i.e. treated as normal vowels.

    Returns:
        List[str]: List of phoneme + prosody symbols.

    Examples:
        >>> pyopenjtalk_g2p_prosody("こんにちは。")
        ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']

    .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
        modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104

    """
    labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
    last_index = len(labels) - 1
    phones = []

    for n, lab_curr in enumerate(labels):
        # current phoneme
        p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)
        # deal unvoiced vowels as normal vowels
        if drop_unvoiced_vowels and p3 in "AEIOU":
            p3 = p3.lower()

        # sil is only expected at the beginning and the end of the text
        if p3 == "sil":
            assert n == 0 or n == last_index
            if n == 0:
                phones.append("^")
            elif n == last_index:
                # question form or not
                e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
                if e3 == 0:
                    phones.append("$")
                elif e3 == 1:
                    phones.append("?")
            continue

        if p3 == "pau":
            phones.append("_")
            continue

        phones.append(p3)

        # accent type and position info (forward or backward)
        a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
        a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
        a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)

        # number of mora in accent phrase
        f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)

        # same position feature, read from the following label
        a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])

        if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
            # accent phrase border
            phones.append("#")
        elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
            # pitch falling
            phones.append("]")
        elif a2 == 1 and a2_next == 2:
            # pitch rising
            phones.append("[")

    return phones
|
165 |
+
|
166 |
+
def _numeric_feature_by_regex(regex, s):
|
167 |
+
match = re.search(regex, s)
|
168 |
+
if match is None:
|
169 |
+
return -50
|
170 |
+
return int(match.group(1))
|
171 |
+
import pyopenjtalk
|
172 |
+
def build_phone_to_katakana():
    """Build the lookup tables between katakana syllables and phoneme strings.

    Every entry of the built-in syllable list is passed to
    ``pyopenjtalk.g2p`` and the (whitespace-trimmed) result is stored.

    Returns:
        tuple: ``(phone_to_katakana, katakana_to_phone)`` where
        ``katakana_to_phone`` maps each syllable to its phoneme string and
        ``phone_to_katakana`` is the reverse mapping. If several syllables
        share a phoneme string, the one listed last wins in the reverse map.
    """
    # All basic katakana syllables. A few entries are listed twice; this is
    # harmless because they are dictionary keys.
    basic_katakana = [
        'ア', 'イ', 'ウ', 'エ', 'オ',
        'カ', 'キ', 'ク', 'ケ', 'コ',
        'サ', 'シ', 'ス', 'セ', 'ソ',
        'タ', 'チ', 'ツ', 'テ', 'ト',
        'ナ', 'ニ', 'ヌ', 'ネ', 'ノ',
        'ハ', 'ヒ', 'フ', 'ヘ', 'ホ',
        'マ', 'ミ', 'ム', 'メ', 'モ',
        'ヤ', 'ユ', 'ヨ',
        'ラ', 'リ', 'ル', 'レ', 'ロ',
        'ワ', 'ヲ', 'ン',
        'ガ', 'ギ', 'グ', 'ゲ', 'ゴ',
        'ザ', 'ジ', 'ズ', 'ゼ', 'ゾ',
        'ダ', 'ヂ', 'ヅ', 'デ', 'ド',
        'バ', 'ビ', 'ブ', 'ベ', 'ボ',
        'パ', 'ピ', 'プ', 'ペ', 'ポ',
        'キャ', 'キュ', 'キョ',
        'シャ', 'シュ', 'ショ',
        'チャ', 'チュ', 'チョ',
        'ニャ', 'ニュ', 'ニョ',
        'ヒャ', 'ヒュ', 'ヒョ',
        'ミャ', 'ミュ', 'ミョ',
        'リャ', 'リュ', 'リョ',
        'ギャ', 'ギュ', 'ギョ',
        'ジャ', 'ジュ', 'ジョ',
        'ビャ', 'ビュ', 'ビョ',
        'ピャ', 'ピュ', 'ピョ',
        'ヴァ', 'ヴィ', 'ヴ', 'ヴェ', 'ヴォ',
        'ファ', 'フィ', 'フェ', 'フォ',
        'ウィ', 'ウェ', 'ウォ',
        'ティ', 'トゥ',
        'ディ', 'ドゥ',
        'ツァ', 'ツィ', 'ツェ', 'ツォ',
        'デュ', 'デョ',
        'ジェ', 'ジョ',
        'チェ', 'チョ',
        'シェ', 'ショ',
        'ヂェ', 'ヂョ',
        'ヒェ', 'ヒョ',
        'ビェ', 'ビョ',
        'ピェ', 'ピョ',
        'キェ', 'キョ',
        'ギェ', 'ギョ',
        'ミェ', 'ミョ',
        'リェ', 'リョ',
        'アァ', 'イィ', 'ウゥ', 'エェ', 'オォ',
        'ヴャ', 'ヴュ', 'ヴョ',
        'ッ', 'ー'
    ]

    katakana_to_phone = {}
    for kana in basic_katakana:
        # Phoneme string for this syllable. strip() trims surrounding
        # whitespace; the former strip('') removed nothing at all.
        katakana_to_phone[kana] = pyopenjtalk.g2p(kana).strip()

    phone_to_katakana = {}
    for kana, phones in katakana_to_phone.items():
        # No collision check is made: a later syllable with the same phoneme
        # string overwrites an earlier one.
        phone_to_katakana[phones] = kana

    return phone_to_katakana, katakana_to_phone
|
244 |
+
def phones_list_to_katakana(phone_list, phone_to_katakana):
    """Convert a list of phonemes / prosody symbols into a katakana string.

    Prosody symbols ('^', '_', '[', ']', '#', '$', '?', '↑', '↓') are copied
    through. For everything else the longest run of up to five consecutive
    items whose space-joined form is a key of ``phone_to_katakana`` is
    replaced by the mapped katakana. An item that cannot be mapped is
    reported with ``print`` and skipped.

    Args:
        phone_list: Sequence of phoneme / symbol strings.
        phone_to_katakana: Mapping from space-joined phoneme strings to kana.

    Returns:
        The converted string with '[' turned into '↑' and ']' into '↓', or
        "…" when nothing at all was produced.
    """
    special_symbols = {'^', '_', '[', ']', '#', '$', '?', '↑', '↓'}
    max_match_length = 5
    length = len(phone_list)
    pieces = []
    i = 0

    while i < length:
        phone = phone_list[i]
        if phone in special_symbols:
            pieces.append(phone)
            i += 1
            continue

        # Greedy longest match; the window never runs past the end.
        for span in range(min(max_match_length, length - i), 0, -1):
            phones_seq = ' '.join(phone_list[i:i + span])
            if phones_seq in phone_to_katakana:
                pieces.append(phone_to_katakana[phones_seq])
                i += span
                break
        else:
            # The span == 1 pass already tried the single phone, so no
            # further lookup is needed here.
            print(f"无法映射的音素: {phone}")
            i += 1

    output = ''.join(pieces)
    if not output:
        return "…"
    return output.replace("[", "↑").replace("]", "↓")
|
278 |
+
def katakana_to_phones_list(katakana_list, katakana_to_phone):
    """Convert a list of katakana characters / symbols into phoneme strings.

    Prosody symbols ('^', '_', '[', ']', '#', '$', '?', '↑', '↓') are copied
    through. For everything else the longest run of up to five consecutive
    items whose concatenation is a key of ``katakana_to_phone`` is replaced
    by the mapped phoneme string. An item that cannot be mapped is reported
    with ``print`` and skipped.

    Args:
        katakana_list: Sequence of katakana / symbol strings.
        katakana_to_phone: Mapping from katakana syllables to phoneme strings.

    Returns:
        list: The symbols and phoneme strings in order, or ``["…"]`` when
        nothing at all was produced.
    """
    special_symbols = {'^', '_', '[', ']', '#', '$', '?', '↑', '↓'}
    max_match_length = 5
    length = len(katakana_list)
    output = []
    i = 0

    while i < length:
        katakana = katakana_list[i]
        if katakana in special_symbols:
            output.append(katakana)
            i += 1
            continue

        # Greedy longest match; the window never runs past the end.
        for span in range(min(max_match_length, length - i), 0, -1):
            katakana_seq = ''.join(katakana_list[i:i + span])
            if katakana_seq in katakana_to_phone:
                output.append(katakana_to_phone[katakana_seq])
                i += span
                break
        else:
            # The span == 1 pass already tried the single character, so no
            # further lookup is needed here.
            print(f"无法映射的片假名: {katakana}")
            i += 1

    if not output:
        return ["…"]
    return output
|
311 |
+
|
312 |
+
phone_to_katakana, katakana_to_phone = build_phone_to_katakana()
|
313 |
+
|
314 |
+
def surface_to_katakana_with_accent(text):
    """Convert surface text into accent-annotated katakana.

    Every "…" is removed from ``text``, the remainder is turned into a
    phoneme + prosody sequence by ``pyopenjtalk_g2p_prosody`` and that
    sequence is rendered with ``phones_list_to_katakana`` using the
    module-level ``phone_to_katakana`` table.
    """
    without_ellipsis = text.replace("…", "")
    prosody_phones = pyopenjtalk_g2p_prosody(without_ellipsis)
    return phones_list_to_katakana(prosody_phones, phone_to_katakana)
|
318 |
+
|
319 |
+
def katakana_to_phones(katakana, katakana_to_phone = katakana_to_phone):
    """Convert an annotated katakana string into a space-separated phone string.

    The string is processed character by character with
    ``katakana_to_phones_list``. The markers '^', '#' and '$' are dropped
    from the result; all other symbols are kept.

    Args:
        katakana: Annotated katakana text, e.g. '^キョ↓オワ#ワ↑タシノ$'.
        katakana_to_phone: Mapping from katakana syllables to phoneme
            strings; defaults to the module-level table.

    Returns:
        The phoneme strings and remaining symbols joined by single spaces.
    """
    dropped_markers = {'^', '#', '$'}
    phone_list = katakana_to_phones_list(list(katakana), katakana_to_phone)
    # Drop the markers before joining so that no doubled spaces are left
    # behind, however many markers sit next to each other.
    kept = [item for item in phone_list if item not in dropped_markers]
    return ' '.join(kept).strip()
|
323 |
+
|
324 |
+
# 处理文本中的标点符号和空格
|
325 |
+
# def preprocess_text(text):
|
326 |
+
# # 定义日语字符的正则表达式
|
327 |
+
# japanese_characters = re.compile(
|
328 |
+
# r"[ぁ-ゟ゠-ヿ一-龯]"
|
329 |
+
# )
|
330 |
+
# # 定义非日语字符(包括标点符号、空格等)的正则表达式
|
331 |
+
# non_japanese_characters = re.compile(
|
332 |
+
# r"[^ぁ-ゟ゠-ヿ一-龯]+"
|
333 |
+
# )
|
334 |
+
# sentences = re.split(non_japanese_characters, text)
|
335 |
+
# marks = re.findall(non_japanese_characters, text)
|
336 |
+
# processed_text = []
|
337 |
+
# for i, sentence in enumerate(sentences):
|
338 |
+
# if sentence:
|
339 |
+
# annotated_sentence = get_katakana_with_accent(sentence)
|
340 |
+
# processed_text.append(annotated_sentence)
|
341 |
+
# if i < len(marks):
|
342 |
+
# mark = marks[i]
|
343 |
+
# if mark.strip():
|
344 |
+
# processed_text.append(mark)
|
345 |
+
# temp = ''.join(processed_text)
|
346 |
+
# return_text = temp.replace("’", "↑")
|
347 |
+
# return return_text
|
348 |
+
def preprocess_text(text):
    """Return ``surface_to_katakana_with_accent(text)``; kept as an alias."""
    annotated = surface_to_katakana_with_accent(text)
    return annotated
|
351 |
+
# Example usage
if __name__ == "__main__":
    text = "^キョ↓オワ#ワ↑タシノ#マ↑ホオ#エ↑ネル↓キイノ#ホ↑キュウノ#タ↑メ↓ギ$"
    annotated_text = katakana_to_phones(text)
    print(annotated_text)
|