AkitoP commited on
Commit
0888de7
1 Parent(s): 5024e84
__pycache__/parse_accent.cpython-310.pyc ADDED
Binary file (1.28 kB)
 
__pycache__/surface2katakana_with_acc.cpython-310.pyc ADDED
Binary file (7.97 kB)
 
app.py CHANGED
@@ -8,7 +8,8 @@ import librosa
  import spaces
  import torch
  from transformers import pipeline, WhisperConfig
-
+ from parse_accent import parse_pitch_accent
+ from surface2katakana_with_acc import katakana_to_phones
  warnings.filterwarnings("ignore")

  is_hf = os.getenv("SYSTEM") == "spaces"
@@ -29,7 +30,7 @@ pipe = pipeline(
  def transcribe(audio: str) -> str:
      result = pipe(audio, generate_kwargs=generate_kwargs)["text"]
      print(result)
-     return result
+     return result, parse_pitch_accent(result), katakana_to_phones(result)


  initial_md = """
@@ -43,6 +44,8 @@ with gr.Blocks() as app:
      audio = gr.Audio(type="filepath")
      transcribe_btn = gr.Button("Transcribe")
      output = gr.Textbox(label="Result")
-     transcribe_btn.click(fn=transcribe,inputs=[audio], outputs=[output])
+     output_HL_style = gr.Textbox(label="HL Result (SBV2 style)")
+     output_UPDOWN_style = gr.Textbox(label="↑↓ Result (GSV style)")
+     transcribe_btn.click(fn=transcribe,inputs=[audio], outputs=[output, output_HL_style, output_UPDOWN_style])

  app.launch(inbrowser=True)
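
The new click wiring follows Gradio's standard multi-output pattern: a handler that returns a tuple is mapped positionally onto the components listed in outputs. A minimal, self-contained sketch of that pattern (fake_transcribe below is a hypothetical stand-in for the Whisper pipeline call, not part of this repo):

import gradio as gr

def fake_transcribe(audio: str):
    # Hypothetical stand-in for the Whisper pipeline call; returns one value
    # per output component, in the same order as the outputs list below.
    text = f"transcribed: {audio}"
    return text, "HL-style marks", "up/down-style marks"

with gr.Blocks() as demo:
    audio = gr.Audio(type="filepath")
    btn = gr.Button("Transcribe")
    out_text = gr.Textbox(label="Result")
    out_hl = gr.Textbox(label="HL Result (SBV2 style)")
    out_updown = gr.Textbox(label="↑↓ Result (GSV style)")
    # The returned tuple maps positionally onto these three components.
    btn.click(fn=fake_transcribe, inputs=[audio], outputs=[out_text, out_hl, out_updown])

demo.launch()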
parse_accent.py ADDED
@@ -0,0 +1,62 @@
+ def parse_pitch_accent(s):
+     # Remove '^', '#', and '$', keep '_', '?'
+     s = s.replace('^', '').replace('#', '').replace('$', '')
+
+     marks = []  # List to store the binary marks
+     current_mark = None  # Current mark (0 or 1)
+     last_accent = None  # '↑' or '↓' or None
+     prev_char_index = -1  # Index of the previous character (not an accent marker)
+     chars = list(s)  # List of characters from the string
+
+     i = 0
+     while i < len(chars):
+         char = chars[i]
+         if char == '↑' or char == '↓':
+             if last_accent == char:
+                 # Apply special rules for consecutive same accents
+                 if char == '↑':
+                     # Mark 0 before the second '↑'
+                     if prev_char_index >= 0:
+                         marks[prev_char_index] = '0'
+                 elif char == '↓':
+                     # Mark 1 before the second '↓'
+                     if prev_char_index >= 0:
+                         marks[prev_char_index] = '1'
+             else:
+                 # At the start, determine the initial mark based on the first accent
+                 if current_mark is None:
+                     current_mark = '0' if char == '↑' else '1'
+             # Set the current mark after the accent
+             current_mark = '1' if char == '↑' else '0'
+             last_accent = char
+         elif char in ['_', '?']:
+             # For '_' and '?', append the current mark
+             marks.append(current_mark)
+             prev_char_index = len(marks) - 1
+         else:
+             # Regular character, append the current mark
+             if current_mark is None:
+                 # If no accent has been encountered yet, look ahead to determine the starting mark
+                 for j in range(i, len(chars)):
+                     if chars[j] == '↑':
+                         current_mark = '0'
+                         break
+                     elif chars[j] == '↓':
+                         current_mark = '1'
+                         break
+             marks.append(current_mark)
+             prev_char_index = len(marks) - 1
+         i += 1
+     # Convert the list of marks to a string
+     result = ''.join(marks)
+     return result
+ def katakana_normalize(s):
+     return s.replace("^", "").replace("#", "").replace("↑", "").replace("↓", "").replace("$", "")
+ # Example usage
+ # input_str = '^ト↓シコニ#ワ↑タシワ_ホ↓ボ#マ↓イニチ_オ↑ニ↓イソンニ#ナ↑クダシオ#サ↑レテマスシ$'
+ # output = parse_pitch_accent(input_str)
+ # output_str = katakana_normalize(input_str)
+ # print(output_str)
+ # assert len(output) == len(output_str)
+ # for i in range(len(output)):
+ #     print(f"{output_str[i]}: {output[i]}")
requirements.txt CHANGED
@@ -3,4 +3,5 @@ librosa
  numpy
  spaces
  torch
- transformers
+ transformers
+ pyopenjtalk
surface2katakana_with_acc.py ADDED
@@ -0,0 +1,355 @@
+ import pyopenjtalk
+ import re
+ import sys
+ import os
+
+ # Temporarily redirect stdout and stderr
+ sys.stdout = open(os.devnull, 'w')
+ sys.stderr = open(os.devnull, 'w')
+
+ # Call the function that produces the warning
+ # e.g., pyopenjtalk.some_function()
+
+ # Restore stdout and stderr
+ sys.stdout = sys.__stdout__
+ sys.stderr = sys.__stderr__
+
+ # Hiragana-to-katakana conversion table
+ hiragana_to_katakana = str.maketrans(
+     "ぁあぃいぅうぇえぉおかがきぎくぐけげこご"
+     "さざしじすずせぜそぞただちぢっつづてでとど"
+     "なにぬねのはばぱひびぴふぶぷへべぺほぼぽ"
+     "まみむめもゃやゅゆょよらりるれろゎわゐゑをんゔゕゖ",
+     "ァアィイゥウェエォオカガキギクグケゲコゴ"
+     "サザシジスズセゼソゾタダチヂッツヅテデトド"
+     "ナニヌネノハバパヒビピフブプヘベペホボポ"
+     "マミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶ"
+ )
+
+ # Convert hiragana to katakana
+ def hiragana_to_katakana_func(text):
+     return text.translate(hiragana_to_katakana)
+
+ # Accurately split kana into moras
+ def split_into_moras(kana):
+     # Regex matching Japanese moras, including contracted sounds, small katakana, and the long-vowel mark
+     mora_pattern = re.compile(
+         r"(?:[ァ-ヴー]|[ぁ-ゖ]|ー)[ァィゥェォャュョ]?|ー"
+     )
+     moras = mora_pattern.findall(kana)
+     return moras
+
+ # Annotate pitch rises and falls on the moras according to the acc value
+ def annotate_kana_with_accent(moras, acc):
+     annotated_moras = []
+     for i, mora in enumerate(moras):
+         annotated_moras.append(mora)
+         # When acc == 0, add a rising mark after the first mora
+         if acc == 0 and i == 0:
+             annotated_moras.append('↑')
+         # When acc > 1, add a rising mark after the first mora
+         elif acc > 1 and i == 0:
+             annotated_moras.append('↑')
+         # When acc > 0, add a falling mark after the acc-th mora
+         elif acc > 0 and i + 1 == acc:
+             annotated_moras.append('↓')
+     return ''.join(annotated_moras)
+
+ # Main function: get the katakana sequence with accent marks
+ def get_katakana_with_accent(text):
+     current_accent = 0
+     # For accent type 0 the phrase ends with current_accent == 1; for the others it ends with 0
+     #
+     tokens = pyopenjtalk.run_frontend(text)
+     result = ''
+     for token in tokens:
+         #print(token)
+         mora_size = token['mora_size']
+         if mora_size > 1:
+             pron = token['pron']
+             acc = token['acc']
+             # Convert the pronunciation to kana
+             kana = pyopenjtalk.g2p(pron, kana=True)
+             # Convert to katakana
+             kana = hiragana_to_katakana_func(kana)
+             # Split into moras
+             moras = split_into_moras(kana)
+             # Annotate the accent marks
+             annotated_kana = annotate_kana_with_accent(moras, acc)
+             result += annotated_kana
+         elif mora_size == 0 or token['pron'] == '’':
+             # For punctuation and the like, append the original string directly
+             result += token['string']
+         else:
+             result += token['pron']
+     result = result.replace('’', '↑')
+     return result
+
+ import pyopenjtalk
+ import re
+ def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
+     """Extract phoneme + prosody symbol sequence from input full-context labels.
+
+     The algorithm is based on `Prosodic features control by symbols as input of
+     sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.
+
+     Args:
+         text (str): Input text.
+         drop_unvoiced_vowels (bool): whether to drop unvoiced vowels.
+
+     Returns:
+         List[str]: List of phoneme + prosody symbols.
+
+     Examples:
+         >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
+         >>> pyopenjtalk_g2p_prosody("こんにちは。")
+         ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']
+
+     .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
+         modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
+
+     """
+     labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
+     #print(labels)
+     N = len(labels)
+
+     phones = []
+     for n in range(N):
+         lab_curr = labels[n]
+
+         # current phoneme
+         p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)
+         # treat unvoiced vowels as normal vowels
+         if drop_unvoiced_vowels and p3 in "AEIOU":
+             p3 = p3.lower()
+
+         # deal with sil at the beginning and the end of text
+         if p3 == "sil":
+             assert n == 0 or n == N - 1
+             if n == 0:
+                 phones.append("^")
+             elif n == N - 1:
+                 # check question form or not
+                 e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
+                 if e3 == 0:
+                     phones.append("$")
+                 elif e3 == 1:
+                     phones.append("?")
+             continue
+         elif p3 == "pau":
+             phones.append("_")
+             continue
+         else:
+             phones.append(p3)
+
+         # accent type and position info (forward or backward)
+         a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
+         a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
+         a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)
+
+         # number of mora in accent phrase
+         f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)
+
+         a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
+         # accent phrase border
+         if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
+             phones.append("#")
+         # pitch falling
+         elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
+             phones.append("]")
+         # pitch rising
+         elif a2 == 1 and a2_next == 2:
+             phones.append("[")
+
+     return phones
+
+ def _numeric_feature_by_regex(regex, s):
+     match = re.search(regex, s)
+     if match is None:
+         return -50
+     return int(match.group(1))
+ import pyopenjtalk
+ def build_phone_to_katakana():
+     # All basic katakana syllables
+     basic_katakana = [
+         'ア', 'イ', 'ウ', 'エ', 'オ',
+         'カ', 'キ', 'ク', 'ケ', 'コ',
+         'サ', 'シ', 'ス', 'セ', 'ソ',
+         'タ', 'チ', 'ツ', 'テ', 'ト',
+         'ナ', 'ニ', 'ヌ', 'ネ', 'ノ',
+         'ハ', 'ヒ', 'フ', 'ヘ', 'ホ',
+         'マ', 'ミ', 'ム', 'メ', 'モ',
+         'ヤ', 'ユ', 'ヨ',
+         'ラ', 'リ', 'ル', 'レ', 'ロ',
+         'ワ', 'ヲ', 'ン',
+         'ガ', 'ギ', 'グ', 'ゲ', 'ゴ',
+         'ザ', 'ジ', 'ズ', 'ゼ', 'ゾ',
+         'ダ', 'ヂ', 'ヅ', 'デ', 'ド',
+         'バ', 'ビ', 'ブ', 'ベ', 'ボ',
+         'パ', 'ピ', 'プ', 'ペ', 'ポ',
+         'キャ', 'キュ', 'キョ',
+         'シャ', 'シュ', 'ショ',
+         'チャ', 'チュ', 'チョ',
+         'ニャ', 'ニュ', 'ニョ',
+         'ヒャ', 'ヒュ', 'ヒョ',
+         'ミャ', 'ミュ', 'ミョ',
+         'リャ', 'リュ', 'リョ',
+         'ギャ', 'ギュ', 'ギョ',
+         'ジャ', 'ジュ', 'ジョ',
+         'ビャ', 'ビュ', 'ビョ',
+         'ピャ', 'ピュ', 'ピョ',
+         'ヴァ', 'ヴィ', 'ヴ', 'ヴェ', 'ヴォ',
+         'ファ', 'フィ', 'フェ', 'フォ',
+         'ウィ', 'ウェ', 'ウォ',
+         'ティ', 'トゥ',
+         'ディ', 'ドゥ',
+         'ツァ', 'ツィ', 'ツェ', 'ツォ',
+         'デュ', 'デョ',
+         'ジェ', 'ジョ',
+         'チェ', 'チョ',
+         'シェ', 'ショ',
+         'ヂェ', 'ヂョ',
+         'ヒェ', 'ヒョ',
+         'ビェ', 'ビョ',
+         'ピェ', 'ピョ',
+         'キェ', 'キョ',
+         'ギェ', 'ギョ',
+         'ミェ', 'ミョ',
+         'リェ', 'リョ',
+         'アァ', 'イィ', 'ウゥ', 'エェ', 'オォ',
+         'ヴャ', 'ヴュ', 'ヴョ',
+         'ッ', 'ー'
+     ]
+
+
+     katakana_to_phone = {}
+
+     for kana in basic_katakana:
+         # Convert the katakana to hiragana
+         # hiragana = pyopenjtalk.g2p(kana, kana=True)
+         # Convert the kana to its phoneme representation
+         phones = pyopenjtalk.g2p(kana)
+         #print(phones)
+         # Strip the leading and trailing silence marks (pau)
+         phones = phones.strip('')
+         # Store the mapping
+         katakana_to_phone[kana] = phones
+
+     phone_to_katakana = {}
+
+     for kana, phones in katakana_to_phone.items():
+         # Check whether the same phoneme mapping already exists
+         phone_to_katakana[phones] = kana
+     return phone_to_katakana, katakana_to_phone
+ # Conversion function
+ def phones_list_to_katakana(phone_list, phone_to_katakana):
+     output = ''
+     i = 0
+     length = len(phone_list)
+     special_symbols = {'^', '_', '[', ']', '#', '$', '?', '↑', '↓'}
+
+     while i < length:
+         phone = phone_list[i]
+         if phone in special_symbols:
+             output += phone
+             i += 1
+         else:
+             max_match_length = 5
+             match_found = False
+             for l in range(max_match_length, 0, -1):
+                 if i + l <= length:
+                     phones_seq = ' '.join(phone_list[i:i+l])
+                     if phones_seq in phone_to_katakana:
+                         output += phone_to_katakana[phones_seq]
+                         i += l
+                         match_found = True
+                         break
+             if not match_found:
+                 single_phone = phone_list[i]
+                 if single_phone in phone_to_katakana:
+                     output += phone_to_katakana[single_phone]
+                     i += 1
+                 else:
+                     print(f"Unmappable phoneme: {single_phone}")
+                     i += 1
+     if len(output) == 0:
+         return "…"
+     return output.replace("[", "↑").replace("]", "↓")
+ def katakana_to_phones_list(katakana_list, katakana_to_phone):
+     output = []
+     i = 0
+     length = len(katakana_list)
+     special_symbols = {'^', '_', '[', ']', '#', '$', '?', '↑', '↓'}
+
+     while i < length:
+         katakana = katakana_list[i]
+         if katakana in special_symbols:
+             output.append(katakana)
+             i += 1
+         else:
+             max_match_length = 5
+             match_found = False
+             for l in range(max_match_length, 0, -1):
+                 if i + l <= length:
+                     katakana_seq = ''.join(katakana_list[i:i+l])
+                     if katakana_seq in katakana_to_phone:
+                         output.append(katakana_to_phone[katakana_seq])
+                         i += l
+                         match_found = True
+                         break
+             if not match_found:
+                 single_katakana = katakana_list[i]
+                 if single_katakana in katakana_to_phone:
+                     output.append(katakana_to_phone[single_katakana])
+                     i += 1
+                 else:
+                     print(f"Unmappable katakana: {single_katakana}")
+                     i += 1
+     if len(output) == 0:
+         return ["…"]
+     return output
+
+ phone_to_katakana, katakana_to_phone = build_phone_to_katakana()
+
+ def surface_to_katakana_with_accent(text):
+     text = text.replace("…", "")
+     phones = pyopenjtalk_g2p_prosody(text)
+     return phones_list_to_katakana(phones, phone_to_katakana)
+
+ def katakana_to_phones(katakana, katakana_to_phone=katakana_to_phone):
+     katakana_list = list(katakana)
+     phone_list = katakana_to_phones_list(katakana_list, katakana_to_phone)
+     return ' '.join(phone_list).replace("^", "").replace("#", "").replace("$", "").replace("  ", " ").strip()
+
+ # Handle punctuation and whitespace in the text
+ # def preprocess_text(text):
+ #     # Regex for Japanese characters
+ #     japanese_characters = re.compile(
+ #         r"[ぁ-ゟ゠-ヿ一-龯]"
+ #     )
+ #     # Regex for non-Japanese characters (punctuation, spaces, etc.)
+ #     non_japanese_characters = re.compile(
+ #         r"[^ぁ-ゟ゠-ヿ一-龯]+"
+ #     )
+ #     sentences = re.split(non_japanese_characters, text)
+ #     marks = re.findall(non_japanese_characters, text)
+ #     processed_text = []
+ #     for i, sentence in enumerate(sentences):
+ #         if sentence:
+ #             annotated_sentence = get_katakana_with_accent(sentence)
+ #             processed_text.append(annotated_sentence)
+ #         if i < len(marks):
+ #             mark = marks[i]
+ #             if mark.strip():
+ #                 processed_text.append(mark)
+ #     temp = ''.join(processed_text)
+ #     return_text = temp.replace("’", "↑")
+ #     return return_text
+ def preprocess_text(text):
+     #print(text)
+     return surface_to_katakana_with_accent(text)
+ # Example usage
+ if __name__ == "__main__":
+     text = "^キョ↓オワ#ワ↑タシノ#マ↑ホオ#エ↑ネル↓キイノ#ホ↑キュウノ#タ↑メ↓ギ$"
+     annotated_text = katakana_to_phones(text)
+     print(annotated_text)
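
Taken together, the two new modules give the conversion chain that app.py now exposes: annotated katakana in, an SBV2-style 0/1 string and a GSV-style phone string out. A minimal end-to-end sketch, assuming pyopenjtalk and its dictionary are installed; the sample text is the one from the docstring above, and the converters in the last two lines are the same ones transcribe() applies to Whisper's output, here fed locally generated annotations instead:

from parse_accent import parse_pitch_accent
from surface2katakana_with_acc import surface_to_katakana_with_accent, katakana_to_phones

# Raw Japanese text -> katakana annotated with ^ # $ ↑ ↓ (requires pyopenjtalk).
annotated = surface_to_katakana_with_accent("こんにちは。")
print(annotated)

# The same two converters that transcribe() in app.py applies to the model output.
print(parse_pitch_accent(annotated))   # SBV2-style: a '0'/'1' pitch mark per character
print(katakana_to_phones(annotated))   # GSV-style: space-separated phones with ↑/↓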