Deddy committed on
Commit
44d88a1
1 Parent(s): 56b9ff7

Upload 10 files


rename folder

g2pid/.DS_Store ADDED
Binary file (6.15 kB).
 
g2pid/.gitignore ADDED
@@ -0,0 +1,164 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/#use-with-ide
+ .pdm.toml
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
+
+ .DS_Store
+ .backup/
+ .data/
g2pid/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .g2p import G2P
+
+ __version__ = "0.0.5"
g2pid/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (196 Bytes).
 
g2pid/__pycache__/g2p.cpython-310.pyc ADDED
Binary file (5.29 kB).
 
g2pid/__pycache__/syllable_splitter.cpython-310.pyc ADDED
Binary file (2.53 kB).
 
g2pid/data/dict.json ADDED
The diff for this file is too large to render.
 
g2pid/g2p.py ADDED
@@ -0,0 +1,220 @@
+ import json
+ import os
+ import re
+
+ import numpy as np
+ import onnxruntime
+ from nltk.tokenize import TweetTokenizer
+ from sacremoses import MosesDetokenizer
+
+ from .syllable_splitter import SyllableSplitter
+
+ ABJAD_MAPPING = {
+     "a": "a",
+     "b": "bé",
+     "c": "cé",
+     "d": "dé",
+     "e": "é",
+     "f": "èf",
+     "g": "gé",
+     "h": "ha",
+     "i": "i",
+     "j": "jé",
+     "k": "ka",
+     "l": "èl",
+     "m": "èm",
+     "n": "èn",
+     "o": "o",
+     "p": "pé",
+     "q": "ki",
+     "r": "èr",
+     "s": "ès",
+     "t": "té",
+     "u": "u",
+     "v": "vé",
+     "w": "wé",
+     "x": "èks",
+     "y": "yé",
+     "z": "zèt",
+ }
+
+ PHONETIC_MAPPING = {
+     "sy": "ʃ",
+     "ny": "ɲ",
+     "ng": "ŋ",
+     "dj": "dʒ",
+     "'": "ʔ",
+     "c": "tʃ",
+     "é": "e",
+     "è": "ɛ",
+     "ê": "ə",
+     "g": "ɡ",
+     "I": "ɪ",
+     "j": "dʒ",
+     "ô": "ɔ",
+     "q": "k",
+     "U": "ʊ",
+     "v": "f",
+     "x": "ks",
+     "y": "j",
+ }
+
+
+ dirname = os.path.dirname(__file__)
+
+ # Predict pronunciation with BERT masking
+ # Read more: https://w11wo.github.io/posts/2022/04/predicting-phonemes-with-bert/
+ class Predictor:
+     def __init__(self, model_path):
+         # fmt: off
+         self.vocab = ['', '[UNK]', 'a', 'n', 'ê', 'e', 'i', 'r', 'k', 's', 't', 'g', 'm', 'u', 'l', 'p', 'o', 'd', 'b', 'h', 'c', 'j', 'y', 'f', 'w', 'v', 'z', 'x', 'q', '[mask]']
+         self.mask_token_id = self.vocab.index("[mask]")
+         # fmt: on
+         self.session = onnxruntime.InferenceSession(model_path)
+
+     def predict(self, word: str) -> str:
+         """
+         Predict the phonetic representation of a word.
+
+         Args:
+             word (str): The word to predict.
+
+         Returns:
+             str: The predicted phonetic representation of the word.
+         """
+         # Mask every "e", whose pronunciation (e vs ê) is ambiguous
+         text = [self.vocab.index(c) if c != "e" else self.mask_token_id for c in word]
+         text.extend([0] * (32 - len(text)))  # Pad to 32 tokens
+         inputs = np.array([text], dtype=np.int64)
+         (predictions,) = self.session.run(None, {"input_4": inputs})
+
+         # Find the indices of the masked tokens
+         _, masked_index = np.where(inputs == self.mask_token_id)
+
+         # Get predictions at the masked indices only
+         mask_prediction = predictions[0][masked_index]
+         predicted_ids = np.argmax(mask_prediction, axis=1)
+
+         # Replace each mask with its predicted token
+         for i, idx in enumerate(masked_index):
+             text[idx] = predicted_ids[i]
+
+         return "".join([self.vocab[i] for i in text if i != 0])
+
+
+ class G2P:
+     def __init__(self):
+         self.tokenizer = TweetTokenizer()
+         self.detokenizer = MosesDetokenizer(lang="id")
+
+         dict_path = os.path.join(dirname, "data/dict.json")
+         with open(dict_path) as f:
+             self.dict = json.load(f)
+
+         model_path = os.path.join(dirname, "model/bert_pron.onnx")
+         self.predictor = Predictor(model_path)
+
+         self.syllable_splitter = SyllableSplitter()
+
+     def __call__(self, text: str) -> str:
+         """
+         Convert text to its phonetic representation.
+
+         Args:
+             text (str): The text to convert.
+
+         Returns:
+             str: The phonetic representation of the text.
+         """
+         text = text.lower()
+         text = re.sub(r"[^ a-z0-9'\.,?!-]", "", text)
+         text = text.replace("-", " ")
+
+         prons = []
+         words = self.tokenizer.tokenize(text)
+         for word in words:
+             # PUEBI pronunciation
+             if word in self.dict:
+                 pron = self.dict[word]
+             elif len(word) == 1 and word in ABJAD_MAPPING:
+                 pron = ABJAD_MAPPING[word]
+             elif "e" not in word or not word.isalpha():
+                 pron = word
+             else:  # the word contains "e": let the BERT model disambiguate it
+                 pron = self.predictor.predict(word)
+
+             # Collapse the allophones of /e/ into e (temporary)
+             pron = pron.replace("é", "e")
+             pron = pron.replace("è", "e")
+
+             # Replace initial /x/ with /s/
+             if pron.startswith("x"):
+                 pron = "s" + pron[1:]
+
+             sylls = self.syllable_splitter.split_syllables(pron)
+             # Decide where to put the stress (default: the penultimate
+             # syllable, shifted when the penult contains a schwa ê)
+             stress_loc = len(sylls) - 1
+             if len(sylls) > 1 and "ê" in sylls[-2]:
+                 if "ê" in sylls[-1]:
+                     stress_loc = len(sylls) - 2
+                 else:
+                     stress_loc = len(sylls)
+
+             # Apply rules on a per-syllable basis.
+             # All allophones are set to tense by default
+             # and will be changed to lax if needed.
+             alophone = {"e": "é", "o": "o"}
+             alophone_map = {"i": "I", "u": "U", "e": "è", "o": "ô"}
+             for i, syll in enumerate(sylls, start=1):
+                 # Mark syllable stress
+                 if i == stress_loc:
+                     syll = "ˈ" + syll
+
+                 # Allophone syllable rules
+                 for v in ["e", "o"]:
+                     # Replace with the lax allophone [ɛ, ɔ] if
+                     # in a closed final syllable
+                     if v in syll and not syll.endswith(v) and i == len(sylls):
+                         alophone[v] = alophone_map[v]
+
+                 # Allophone syllable stress rules
+                 for v in ["i", "u"]:
+                     # Replace with the lax allophone [ɪ, ʊ] if
+                     # in the middle of an unstressed syllable
+                     # that does not end with a nasal coda [m, n, ng]
+                     # (except for the final syllable)
+                     if (
+                         v in syll
+                         and not syll.startswith("ˈ")
+                         and not syll.endswith(v)
+                         and (
+                             not any(syll.endswith(x) for x in ["m", "n", "ng"])
+                             or i == len(sylls)
+                         )
+                     ):
+                         syll = syll.replace(v, alophone_map[v])
+
+                 # Final-consonant adjustments
+                 if syll.endswith("nk"):
+                     syll = syll[:-2] + "ng"
+                 elif syll.endswith("d"):
+                     syll = syll[:-1] + "t"
+                 elif syll.endswith("b"):
+                     syll = syll[:-1] + "p"
+                 elif syll.endswith("k") or (
+                     syll.endswith("g") and not syll.endswith("ng")
+                 ):
+                     syll = syll[:-1] + "'"
+                 sylls[i - 1] = syll
+
+             pron = "".join(sylls)
+             # Apply allophone and phonetic mappings
+             for v in alophone:
+                 if v == "o" and pron.count("o") == 1:
+                     continue
+                 pron = pron.replace(v, alophone[v])
+             for g, p in PHONETIC_MAPPING.items():
+                 pron = pron.replace(g, p)
+             pron = pron.replace("kh", "x")
+
+             prons.append(pron)
+             prons.append(" ")
+
+         return self.detokenizer.detokenize(prons)
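
For orientation, here is a minimal usage sketch of the G2P class added above (not part of the commit; the sample sentence is arbitrary, and the exact transcription depends on the shipped dict.json and bert_pron.onnx):

from g2pid import G2P

g2p = G2P()  # loads data/dict.json and model/bert_pron.onnx relative to the package
ipa = g2p("Selamat pagi")  # arbitrary sample input
print(ipa)  # an IPA-like string with stress marks and phonemes such as ʃ, ɲ, ŋ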
g2pid/model/bert_pron.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9bc9b45f1cdeff4dc473f722627e94db4e3ff0ba7a2b066e542a0fa46f49d330
+ size 1295867
g2pid/syllable_splitter.py ADDED
@@ -0,0 +1,127 @@
+ # Copied from https://github.com/fahadh4ilyas/syllable_splitter
+ # MIT License
+ import re
+
+
+ class SyllableSplitter:
+     def __init__(self):
+         self.consonant = set(
+             [
+                 "b",
+                 "c",
+                 "d",
+                 "f",
+                 "g",
+                 "h",
+                 "j",
+                 "k",
+                 "l",
+                 "m",
+                 "n",
+                 "p",
+                 "q",
+                 "r",
+                 "s",
+                 "t",
+                 "v",
+                 "w",
+                 "x",
+                 "y",
+                 "z",
+                 "ng",
+                 "ny",
+                 "sy",
+                 "ch",
+                 "dh",
+                 "gh",
+                 "kh",
+                 "ph",
+                 "sh",
+                 "th",
+             ]
+         )
+         self.double_consonant = set(["ll", "ks", "rs", "rt", "nk", "nd"])
+         self.vocal = set(["a", "e", "ê", "é", "è", "i", "o", "u"])
+
+     def split_letters(self, string):
+         # Split a word into letter units and build an arrangement string:
+         # "c" for consonants, "v" for vowels, "s" for any other symbol.
+         letters = []
+         arrange = []
+
+         while string != "":
+             letter = string[:2]
+
+             if letter in self.double_consonant:
+                 # Split the pair when a vowel follows (e.g. "nda" -> "n" + "da")
+                 if string[2:] != "" and string[2] in self.vocal:
+                     letters += [letter[0]]
+                     arrange += ["c"]
+                     string = string[1:]
+                 else:
+                     letters += [letter]
+                     arrange += ["c"]
+                     string = string[2:]
+             elif letter in self.consonant:
+                 letters += [letter]
+                 arrange += ["c"]
+                 string = string[2:]
+             elif letter in self.vocal:
+                 letters += [letter]
+                 arrange += ["v"]
+                 string = string[2:]
+             else:
+                 letter = string[0]
+
+                 if letter in self.consonant:
+                     letters += [letter]
+                     arrange += ["c"]
+                     string = string[1:]
+                 elif letter in self.vocal:
+                     letters += [letter]
+                     arrange += ["v"]
+                     string = string[1:]
+                 else:
+                     letters += [letter]
+                     arrange += ["s"]
+                     string = string[1:]
+
+         return letters, "".join(arrange)
+
+     def split_syllables_from_letters(self, letters, arrange):
+         # Insert a boundary after the first consonant of a vcc... cluster
+         consonant_index = re.search(r"vc{2,}", arrange)
+         while consonant_index:
+             i = consonant_index.start() + 1
+             letters = letters[: i + 1] + ["|"] + letters[i + 1 :]
+             arrange = arrange[: i + 1] + "|" + arrange[i + 1 :]
+             consonant_index = re.search(r"vc{2,}", arrange)
+
+         # Split consecutive vowels
+         vocal_index = re.search(r"v{2,}", arrange)
+         while vocal_index:
+             i = vocal_index.start()
+             letters = letters[: i + 1] + ["|"] + letters[i + 1 :]
+             arrange = arrange[: i + 1] + "|" + arrange[i + 1 :]
+             vocal_index = re.search(r"v{2,}", arrange)
+
+         # In a v-c-v sequence, the consonant starts the next syllable
+         vcv_index = re.search(r"vcv", arrange)
+         while vcv_index:
+             i = vcv_index.start()
+             letters = letters[: i + 1] + ["|"] + letters[i + 1 :]
+             arrange = arrange[: i + 1] + "|" + arrange[i + 1 :]
+             vcv_index = re.search(r"vcv", arrange)
+
+         # Separate other symbols from their neighbours
+         sep_index = re.search(r"[cvs]s", arrange)
+         while sep_index:
+             i = sep_index.start()
+             letters = letters[: i + 1] + ["|"] + letters[i + 1 :]
+             arrange = arrange[: i + 1] + "|" + arrange[i + 1 :]
+             sep_index = re.search(r"[cvs]s", arrange)
+
+         sep_index = re.search(r"s[cvs]", arrange)
+         while sep_index:
+             i = sep_index.start()
+             letters = letters[: i + 1] + ["|"] + letters[i + 1 :]
+             arrange = arrange[: i + 1] + "|" + arrange[i + 1 :]
+             sep_index = re.search(r"s[cvs]", arrange)
+         return "".join(letters).split("|")
+
+     def split_syllables(self, string):
+         letters, arrange = self.split_letters(string)
+         return self.split_syllables_from_letters(letters, arrange)
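
And a quick illustrative call to the vendored splitter (the example word and its expected split are inferred from the CV rules above, not taken from the repository):

from g2pid.syllable_splitter import SyllableSplitter

splitter = SyllableSplitter()
# "makan" yields the arrangement "cvcvc"; the vcv rule inserts a
# boundary after the first vowel, so the result is ["ma", "kan"].
print(splitter.split_syllables("makan"))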