File size: 2,767 Bytes
0888de7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
def parse_pitch_accent(s):
    """Return one '0'/'1' pitch mark per character of an annotated string.

    The markers '^', '#' and '$' are dropped. The two accent markers (rise
    and fall) produce no mark of their own; they switch the mark handed to
    the characters that follow: '1' after a rise, '0' after a fall
    (presumably high and low pitch). Every other character, including '_'
    and '?', receives the mark that is current when it is read.

    Characters that come before the first accent get the opposite of what
    that accent switches to: '0' before a rise, '1' before a fall.

    When the same accent occurs twice with no opposite accent in between,
    the mark of the character just before the second occurrence is
    overwritten ('0' before a repeated rise, '1' before a repeated fall),
    presumably so that the repeated transition still shows in the output.

    Args:
        s: Annotated text containing ordinary characters plus the markers
            described above.

    Returns:
        A string of '0'/'1' characters, one for each character of ``s``
        that is not '^', '#', '$' or an accent marker. Input without any
        accent marker is marked '0' throughout; empty input gives ''.
    """
    # NOTE(review): the accent literals were mis-decoded (both unreadable)
    # in the reviewed copy of this file. U+2191 / U+2193 is a reconstruction
    # from the surrounding 0->1 / 1->0 logic -- confirm against real input.
    # Escapes are used so the code survives another encoding mishap.
    rise = "\u2191"  # '↑'
    fall = "\u2193"  # '↓'

    # These markers never receive a mark, so drop them before scanning.
    s = s.translate(str.maketrans("", "", "^#$"))

    # Resolve the starting mark once, before the loop. Doing it lazily per
    # character left '_'/'?' (and accent-free input) with no mark at all,
    # which made the final join fail on a None entry.
    first_accent = next((c for c in s if c == rise or c == fall), None)
    current_mark = "1" if first_accent == fall else "0"

    marks = []
    last_accent = None
    for char in s:
        if char == rise or char == fall:
            # Same accent again: patch the most recently marked character.
            if char == last_accent and marks:
                marks[-1] = "0" if char == rise else "1"
            current_mark = "1" if char == rise else "0"
            last_accent = char
        else:
            marks.append(current_mark)
    return "".join(marks)
def katakana_normalize(s):
    """Return ``s`` with the markers '^', '#', '$' and both accents removed.

    All other characters, including '_' and '?', are kept in their original
    order. The result is meant to line up character-for-character with the
    mark string produced by ``parse_pitch_accent`` for the same input.

    Args:
        s: Annotated text.

    Returns:
        The text with the five marker characters deleted.
    """
    # NOTE(review): the two accent literals were mis-decoded in the reviewed
    # copy of this file; U+2191 ('↑') and U+2193 ('↓') are a reconstruction
    # -- confirm against real input.
    return s.translate(str.maketrans("", "", "^#$\u2191\u2193"))
# Example usage
# input_str = '^γβγ·γ³γ#γ―βγΏγ·γ―_γβγ#γβγ€γγ_γͺβγβγ€γ½γ³γ#γβγ―γγ·γͺ#γ΅βγ¬γγγΉγ·$'
# output = parse_pitch_accent(input_str)
# output_str = katakana_normalize(input_str)
# print(output_str)
# assert len(output) == len(output_str)
# for i in range(len(output)):
#     print(f"{output_str[i]}: {output[i]}")