def parse_pitch_accent(s: str) -> str:
    """Convert an accent-annotated string into a string of binary pitch marks.

    The boundary symbols '^', '#' and '$' are dropped.  Every remaining
    character that is not an accent marker (including '_' and '?') yields
    exactly one mark, '0' or '1'.  The accent markers themselves yield no
    mark; they only switch the level used for the characters that follow:
    after a rise marker the level is '1', after a fall marker it is '0'.

    Characters that precede the first accent marker get the level that
    marker departs from ('0' before a rise, '1' before a fall).  When the
    same marker occurs twice without the opposite marker in between, the
    mark emitted just before the second occurrence is reset to that
    marker's "before" level, so the repeated rise/fall is still visible.

    NOTE(review): the two marker characters were unreadable (both rendered
    as the same glyph) in the reviewed copy of this file.  They are
    reconstructed here as U+2191 (upwards arrow, rise) and U+2193
    (downwards arrow, fall) from how the marks are assigned -- confirm
    against the data producer.

    Args:
        s: Annotated string, e.g. ``'^a\\u2191bc\\u2193d$'``.

    Returns:
        A string of '0'/'1' with one character per non-marker,
        non-boundary character of ``s``.  A string without any accent
        marker yields all '0' (earlier this case raised ``TypeError``
        because ``None`` placeholders reached ``str.join``).
    """
    rise = "\u2191"  # UPWARDS ARROW
    fall = "\u2193"  # DOWNWARDS ARROW
    level_before = {rise: "0", fall: "1"}
    level_after = {rise: "1", fall: "0"}

    # Boundary symbols carry no pitch of their own; '_' and '?' are kept.
    for boundary in "^#$":
        s = s.replace(boundary, "")

    # Everything ahead of the first accent shares the level that accent
    # starts from.  Resolving this once up front also covers '_' / '?'
    # appearing before any regular character.
    first_accent = next((c for c in s if c in level_before), None)
    current_mark = level_before.get(first_accent, "0")

    marks = []
    last_accent = None
    for char in s:
        if char not in level_before:
            marks.append(current_mark)
            continue

        # Same marker twice in a row: force the most recent mark back to
        # the "before" level so the second rise/fall has somewhere to go.
        if char == last_accent and marks:
            marks[-1] = level_before[char]

        current_mark = level_after[char]
        last_accent = char

    return "".join(marks)
def katakana_normalize(s: str) -> str:
    """Return ``s`` with all annotation symbols removed.

    Strips the boundary symbols '^', '#', '$' and the two accent markers,
    leaving every other character (including '_' and '?') untouched.

    NOTE(review): the accent markers were unreadable (both rendered as the
    same glyph) in the reviewed copy of this file; they are reconstructed
    here as U+2191 (upwards arrow) and U+2193 (downwards arrow) -- confirm
    against the data producer.
    """
    # A deletion table removes all five symbols in a single pass.
    return s.translate(str.maketrans("", "", "^#$\u2191\u2193"))
# Example usage
# (The sample string originally given here was unreadable due to an encoding
# error; this shorter sample uses U+2191 as the rise marker.)
# input_str = '^\u30ef\u2191\u30bf\u30b7\u30ef$'
# output = parse_pitch_accent(input_str)
# output_str = katakana_normalize(input_str)
# print(output_str)
# assert len(output) == len(output_str)
# for i in range(len(output)):
#     print(f"{output_str[i]}: {output[i]}")