TTS-Indonesiaku-Gratis / g2pid /syllable_splitter.py
Deddy's picture
Upload 10 files
44d88a1 verified
# Copied from https://github.com/fahadh4ilyas/syllable_splitter
# MIT License
import re
class SyllableSplitter:
    """Rule-based syllable splitter working on consonant/vowel patterns.

    The text is first tokenised into "letters" (single characters or
    multi-character consonants such as ``"ng"``), each tagged with a class:

    * ``"c"`` - consonant (member of ``self.consonant`` or an unsplit
      ``self.double_consonant`` cluster),
    * ``"v"`` - vowel (member of ``self.vocal``),
    * ``"s"`` - anything else (spaces, digits, punctuation, upper-case
      letters, ...), which always ends up as a chunk of its own.

    Syllable breaks are then inserted purely from the resulting class string.
    """

    # Zero-width patterns marking where a break goes, applied in this order.
    # Each one runs on the class string produced by the previous rule, so a
    # break inserted earlier ("|") blocks later rules from matching across it.
    _BREAK_PATTERNS = (
        re.compile(r"(?<=vc)(?=c)"),     # v c | c   (consonant cluster)
        re.compile(r"(?<=v)(?=v)"),      # v | v     (adjacent vowels)
        re.compile(r"(?<=v)(?=cv)"),     # v | c v   (single consonant)
        re.compile(r"(?<=[cvs])(?=s)"),  # x | s     (break before "other")
        re.compile(r"(?<=s)(?=[cvs])"),  # s | x     (break after "other")
    )

    def __init__(self):
        # Single consonants plus two-character combinations that are always
        # kept together as one consonant.
        self.consonant = {
            "b",
            "c",
            "d",
            "f",
            "g",
            "h",
            "j",
            "k",
            "l",
            "m",
            "n",
            "p",
            "q",
            "r",
            "s",
            "t",
            "v",
            "w",
            "x",
            "y",
            "z",
            "ng",
            "ny",
            "sy",
            "ch",
            "dh",
            "gh",
            "kh",
            "ph",
            "sh",
            "th",
        }
        # Clusters kept together only when NOT directly followed by a vowel.
        self.double_consonant = {"ll", "ks", "rs", "rt", "nk", "nd"}
        self.vocal = {"a", "e", "ê", "é", "è", "i", "o", "u"}

    def split_letters(self, string):
        """Tokenise ``string`` into letters and their class string.

        Args:
            string: Text to tokenise. Matching is case-sensitive: only the
                exact entries of the three sets count as consonants/vowels.

        Returns:
            A ``(letters, arrange)`` tuple. ``letters`` is a list of
            substrings of ``string`` (one or two characters each) that
            concatenate back to ``string``; ``arrange`` is a string of the
            same length as ``letters`` holding ``"c"``, ``"v"`` or ``"s"``
            for each entry.
        """
        letters = []
        arrange = []
        length = len(string)
        pos = 0  # scan by index instead of re-slicing the string each step
        while pos < length:
            pair = string[pos:pos + 2]
            if pair in self.double_consonant:
                after = pos + 2
                if after < length and string[after] in self.vocal:
                    # A vowel follows: only the first consonant is consumed,
                    # the second one is classified on the next iteration.
                    letters.append(pair[0])
                    pos += 1
                else:
                    letters.append(pair)
                    pos += 2
                arrange.append("c")
            elif pair in self.consonant:
                # Two-character consonant, or a lone consonant at the end of
                # the string (where ``pair`` is a single character).
                letters.append(pair)
                arrange.append("c")
                pos += 2
            elif pair in self.vocal:
                letters.append(pair)
                arrange.append("v")
                pos += 2
            else:
                char = string[pos]
                if char in self.consonant:
                    kind = "c"
                elif char in self.vocal:
                    kind = "v"
                else:
                    kind = "s"
                letters.append(char)
                arrange.append(kind)
                pos += 1
        return letters, "".join(arrange)

    @staticmethod
    def _insert_breaks(pattern, letters, arrange):
        """Insert ``"|"`` into both sequences wherever ``pattern`` matches.

        ``pattern`` must be zero-width; match positions are indices into
        ``arrange`` and are applied to ``letters`` at the same indices. Both
        sequences are rebuilt once, and the inputs are not modified.
        """
        cuts = [match.start() for match in pattern.finditer(arrange)]
        if not cuts:
            return letters, arrange
        new_letters = []
        new_arrange = []
        previous = 0
        for cut in cuts:
            new_letters.extend(letters[previous:cut])
            new_letters.append("|")
            new_arrange.append(arrange[previous:cut])
            new_arrange.append("|")
            previous = cut
        new_letters.extend(letters[previous:])
        new_arrange.append(arrange[previous:])
        return new_letters, "".join(new_arrange)

    def split_syllables_from_letters(self, letters, arrange):
        """Group tokenised letters into syllables.

        Args:
            letters: List of letter strings as returned by ``split_letters``.
            arrange: Class string (``"c"``/``"v"``/``"s"``) with one
                character per entry of ``letters``.

        Returns:
            List of chunks obtained by joining ``letters`` and splitting at
            the inserted breaks. Because ``"|"`` is the break marker, a
            literal ``"|"`` in the input also acts as a split point. Empty
            input yields ``[""]``.
        """
        for pattern in self._BREAK_PATTERNS:
            letters, arrange = self._insert_breaks(pattern, letters, arrange)
        return "".join(letters).split("|")

    def split_syllables(self, string):
        """Split ``string`` into syllables (tokenise, then insert breaks)."""
        letters, arrange = self.split_letters(string)
        return self.split_syllables_from_letters(letters, arrange)