# whisper-japanese-phone-demo / parse_accent.py
# (Hugging Face file view: last commit 0888de7 "transform" by AkitoP, 2.77 kB)
def parse_pitch_accent(s):
    """Turn an accent-annotated string into one '0'/'1' mark per character.

    The markers '^', '#' and '$' are dropped and produce no output.  The
    accent markers '↑' and '↓' produce no output either; they switch the
    mark that is attached to the characters that follow: '1' after '↑',
    '0' after '↓' (presumably high / low pitch).  Every other character,
    including '_' and '?', receives the mark that is current at its
    position, so the result has the same length as ``katakana_normalize(s)``
    would return for the same input.

    Characters that appear before the first accent marker get the opposite
    of what that marker switches to: '0' if the first marker is '↑', '1' if
    it is '↓'.  If the string contains no accent marker at all, every
    character is marked '0' (fallback; previously this case raised
    ``TypeError`` because ``None`` ended up in the joined list).

    When the same accent marker occurs twice in a row (with no opposite
    marker in between), the character immediately before the second marker
    is re-marked: '0' before a repeated '↑', '1' before a repeated '↓'.

    Args:
        s: Annotated string, e.g. ``'^ア↑イ↓ウ$'``.

    Returns:
        A string of '0'/'1' characters, e.g. ``'010'``.
    """
    # Boundary markers carry no mark of their own; '_' and '?' are kept.
    s = s.replace('^', '').replace('#', '').replace('$', '')

    # Decide the mark for everything preceding the first accent up front.
    # Doing it once here (instead of lazily inside the loop) guarantees the
    # mark is never None, even for '_' / '?' or for accent-free input.
    first_accent = next((ch for ch in s if ch in ('↑', '↓')), None)
    current_mark = '1' if first_accent == '↓' else '0'

    marks = []
    last_accent = None
    for ch in s:
        if ch in ('↑', '↓'):
            # A repeated accent implies the pitch moved back in between, so
            # the character right before the repeat gets the opposite mark.
            if ch == last_accent and marks:
                marks[-1] = '0' if ch == '↑' else '1'
            current_mark = '1' if ch == '↑' else '0'
            last_accent = ch
        else:
            marks.append(current_mark)

    return ''.join(marks)
def katakana_normalize(s):
    """Return *s* with the markers '^', '#', '↑', '↓' and '$' removed.

    All other characters (including '_' and '?') are kept in their
    original order.
    """
    # A translation table whose third argument lists characters to delete
    # strips all five markers in a single pass.
    strip_markers = str.maketrans('', '', '^#↑↓$')
    return s.translate(strip_markers)
# Example usage
# NOTE(review): the literal below was recovered from a mis-encoded copy; the 'ダ' in
# 'ナ↑クダシオ' is a best guess (could be 'ヤ') - confirm against the original file.
# input_str = '^ト↓シコニ#ワ↑タシワ_ホ↓ボ#マ↓イニチ_オ↑ニ↓イソンニ#ナ↑クダシオ#サ↑レテマスシ$'
# output = parse_pitch_accent(input_str)
# output_str = katakana_normalize(input_str)
# print(output_str)
# assert len(output) == len(output_str)
# for i in range(len(output)):
#     print(f"{output_str[i]}: {output[i]}")