Spaces:

AkitoP
/

whisper-japanese-phone-demo

Running

App Files Files Community

whisper-japanese-phone-demo / parse_accent.py

AkitoP

transform

0888de7 3 months ago

raw

history blame

2.77 kB

	def parse_pitch_accent(s):
	    """Convert an accent-annotated string into a string of '0'/'1' marks.

	    The markers '^', '#' and '$' are dropped first. The accent markers
	    '↑' and '↓' produce no mark themselves; every other character
	    (including '_' and '?') produces exactly one mark, so the result has
	    one mark per non-marker character of the input.

	    Rules:
	      * After '↑' the emitted mark is '1'; after '↓' it is '0'.
	      * Characters before the first accent get the opposite of what that
	        accent switches to ('0' before '↑', '1' before '↓').
	      * When an accent equals the previous accent seen, the mark emitted
	        just before it is overwritten ('0' before a repeated '↑', '1'
	        before a repeated '↓').
	      * If the string contains no accent at all, every mark is '0'.
	        (Previously such input, and '_'/'?' placed before the first
	        accent, put None into the mark list and str.join raised
	        TypeError.)

	    NOTE(review): '1'/'0' presumably stand for high/low pitch - confirm
	    against the consumer of this string.

	    Args:
	        s: Annotated text, e.g. '^ト↓シコニ#ワ↑タシワ$'.

	    Returns:
	        A string made only of '0' and '1' characters.
	    """
	    rising, falling = '↑', '↓'

	    # Remove '^', '#', and '$'; keep '_' and '?'.
	    s = s.replace('^', '').replace('#', '').replace('$', '')

	    # Decide the starting mark once, from the first accent in the string,
	    # instead of re-scanning ahead for every leading character.
	    first_accent = next((c for c in s if c == rising or c == falling), None)
	    current_mark = '1' if first_accent == falling else '0'

	    marks = []
	    last_accent = None
	    for char in s:
	        if char == rising or char == falling:
	            # Same accent as the previous one: the character right before
	            # it takes the opposite mark so the accent still denotes a
	            # change at this position.
	            if char == last_accent and marks:
	                marks[-1] = '0' if char == rising else '1'
	            current_mark = '1' if char == rising else '0'
	            last_accent = char
	        else:
	            # Regular characters as well as '_' and '?' each get one mark.
	            marks.append(current_mark)

	    return ''.join(marks)
	def katakana_normalize(s):
	    """Return *s* with the markers '^', '#', '↑', '↓' and '$' removed.

	    All other characters, including '_' and '?', are kept as they are.
	    """
	    text = s
	    for marker in ("^", "#", "↑", "↓", "$"):
	        text = text.replace(marker, "")
	    return text
	# Example usage
	# input_str = '^ト↓シコニ#ワ↑タシワ_ホ↓ボ#マ↓イニチ_オ↑ニ↓イソンニ#ナ↑クダシオ#サ↑レテマスシ$'
	# output = parse_pitch_accent(input_str)
	# output_str = katakana_normalize(input_str)
	# print(output_str)
	# assert len(output) == len(output_str)
	# for i in range(len(output)):
	#     print(f"{output_str[i]}: {output[i]}")