# -*- coding: utf-8 -*-
# Split Roman Pali words into syllables.
# It splits correctly for most of the words, but not all.
# Update: https://github.com/vpnry/palieasyread
# version 0.0.2

import re
import string
import unicodedata
from collections import OrderedDict

try:
    import gradio as gr  # huggingface demo
except ImportError:
    # Only the web demo at the bottom of this file needs gradio; the
    # splitting functions use the standard library alone, so keep them
    # importable when gradio is not installed.
    gr = None

# app logic
# v0.0.3

# -------- modify these 3 values to your choice
# They are the default arguments of easy_read() and hf_demo().
my_word_divider = ' _ '
my_syllable_divider = ' '
my_show_origin = True

# below is the app logic
vowel_str = 'a,ā,i,ī,u,ū,e,o'
vowels = vowel_str.split(',')
vowels += vowel_str.upper().split(',')
# asp_consonants = 'ch,jh,kh,gh,th,ṭh,dh,ḍh,bh,ph'.split(',')

# Set view of `vowels` for O(1) membership tests in the per-character loop.
_VOWEL_SET = frozenset(vowels)

# Digraphs that must stay together, each mapped to a single placeholder
# character so the consonant-pair pass below sees them as one consonant.
escape_xh = OrderedDict([
    # Myanmar number 1->0
    ('kh', '၁'),
    ('gh', '၂'),
    ('ch', '၃'),
    ('jh', '၄'),
    ('th', '၅'),
    ('ṭh', '၆'),
    ('dh', '၇'),
    ('ḍh', '၈'),
    ('ph', '၉'),
    ('bh', '၀'),
    ('vh', '$'),
    # pariyogāḷhadhammo => pa ri yo gā ḷha dham mo
    ('ḷh', '¢'),
    # gārayhā => gā ra yhā
    ('yh', '£'),
    ('br', '€'),
    ('by', '¥')])

# Last-step literal replacements applied by manual_fix_chunk(), in order.
final_manual_fix = OrderedDict([
    ('K@h', 'Kh'),
    ('G@h', 'Gh'),
    ('C@h', 'Ch'),
    ('J@h', 'Jh'),
    ('T@h', 'Th'),
    ('Ṭ@h', 'Ṭh'),
    ('D@h', 'Dh'),
    ('Ḍ@h', 'Ḍh'),
    ('P@h', 'Ph'),
    ('B@h', 'Bh'),
    ('V@h', 'Vh'),
    ('Ḷ@h', 'Ḷh'),
    ('Y@h', 'Yh'),
    ('B@r', 'Br'),
    ('B@y', 'By'),
    # Manually replace
    ('D@v', 'Dv'),
    # khadv
    ('d@v', '@dv'),
    ('t@v', '@tv'),
    ('s@v', '@sv'),
    ('t@r', '@tr')
])

# Characters that may not appear in a user-supplied divider: the placeholder
# characters above and the internal syllable marker '@'.
not_allow_divs = list(escape_xh.values())
not_allow_divs.append('@')

# Kept as a module-level name; split_syl_word() no longer needs it.
rex_nonWord = re.compile(r'\W+')

# Compiled once here instead of on every call.
# like kkh => k-kh etc: a non-vowel followed by an aspirated digraph.
_ASPIRATE_CLUSTER_RE = re.compile(
    r'([^aāiīuūeo])(ch|jh|kh|gh|th|ṭh|dh|ḍh|bh|ph)', re.IGNORECASE)
# like nn => n-n etc: two adjacent characters that are neither a vowel,
# an ASCII digit, '.', nor '@'.
_CONSONANT_PAIR_RE = re.compile(
    r'([^.aāiīuūeo1234567890@])([^.aāiīuūeo1234567890@])', re.IGNORECASE)
# @t@ => t@: a single non-vowel fenced by two markers.
_FENCED_CONSONANT_RE = re.compile(r'@([^aāiīuūeo])@', re.IGNORECASE)

# fix misc PTT html: markers that ended up next to punctuation, niggahita
# or quotation marks. Applied in this order by manual_fix_chunk().
_STRAY_MARKER_FIXES = (
    ('@,', ','),
    ('@.', '.'),
    ('@;', ';'),
    ('@ṃ', 'ṃ'),
    ('@ṁ', 'ṁ'),
    ('‘@‘', '‘‘'),
    ('’@’', '’’'),
    ('‘@', '‘'),
)


def add_div_consonant(word):
    """Insert '@' markers inside consonant clusters of *word*.

    A word that consists only of '@', ASCII digits, punctuation and
    whitespace is returned unchanged.

    Steps:
      1. put '@' between a non-vowel and a following aspirated digraph
         (kkh => k@kh);
      2. replace every digraph in ``escape_xh`` by its placeholder;
      3. put '@' between the remaining adjacent consonant pairs (nn => n@n);
      4. turn the placeholders back into digraphs.

    NOTE: step 4 also rewrites a placeholder character that was already
    present in the input (for example '$' becomes 'vh').
    """
    if not word.strip('@1234567890' + string.punctuation + string.whitespace):
        return word

    for head, aspirate in _ASPIRATE_CLUSTER_RE.findall(word):
        word = word.replace(head + aspirate, head + '@' + aspirate)

    for digraph, placeholder in escape_xh.items():
        word = word.replace(digraph, placeholder)

    for first, second in _CONSONANT_PAIR_RE.findall(word):
        word = word.replace(first + second, first + '@' + second)

    # restore escaped ?h
    for digraph, placeholder in escape_xh.items():
        word = word.replace(placeholder, digraph)
    return word


def manual_fix_chunk(word):
    """Clean up the '@' markers of an already marked word.

    Collapses ``@x@`` (x being any non-vowel) to ``x@``, removes markers
    that sit before ',', '.', ';', 'ṃ', 'ṁ' or next to the curly quotes
    handled in ``_STRAY_MARKER_FIXES``, applies ``final_manual_fix`` and
    finally strips leading/trailing '@'.
    """
    word = _FENCED_CONSONANT_RE.sub(r'\1@', word)
    for stray, fixed in _STRAY_MARKER_FIXES:
        word = word.replace(stray, fixed)
    for wrong, right in final_manual_fix.items():
        word = word.replace(wrong, right)
    return word.strip('@')


def split_syl_word(word):
    """Return *word* with '@' between its syllables.

    The word is first normalised to Unicode NFC: the vowel list, the digraph
    tables and the regexes are written with single-code-point letters, so a
    decomposed letter (base letter + combining mark) would otherwise get a
    marker between the vowel and its diacritic.

    Words of at most two characters (after normalisation) are returned
    as they are.
    """
    word = unicodedata.normalize('NFC', word)
    if len(word) <= 2:
        return word

    pieces = []
    for char in add_div_consonant(word):
        pieces.append(char)
        # consider a valid syllable after meeting a vowel
        # it works for most of the words.
        if char in _VOWEL_SET:
            pieces.append('@')
    return manual_fix_chunk(''.join(pieces).strip('@'))


def check_div_collision(word_div, syl_div):
    """Return True if either divider contains a character from
    ``not_allow_divs`` (surrounding whitespace of the dividers is ignored)."""
    divs = word_div.strip() + syl_div.strip()
    return any(reserved in divs for reserved in not_allow_divs)


def easy_read(text, word_div=my_word_divider, show_origin=my_show_origin,
              syl_div=my_syllable_divider):
    """Split every word of *text* into syllables.

    Args:
        text: input text; it is processed line by line and each line is
            split into words on single spaces.
        word_div: string placed between the words of a line.
        show_origin: when true, each input line is written, unmodified,
            above its syllabified version.
        syl_div: string placed between the syllables of a word.

    Returns:
        The formatted text, stripped of leading/trailing whitespace.
        If a divider contains a reserved character, an error message is
        printed and '' is returned.
    """
    if check_div_collision(word_div, syl_div):
        print(
            'Error: word_div or syl_div must not contain these chars\n',
            not_allow_divs)
        print('Please use other dividers.')
        return ''

    di = word_div.strip()
    double_word_div = f' {di} {di} '
    one_word_div = f' {di} '

    parts = []
    for line in text.strip().splitlines():
        if not line:
            parts.append('\n')
            continue

        syllabified = (split_syl_word(word) for word in line.strip().split(' '))
        line_chunk = ''.join(
            syls + word_div for syls in syllabified if syls.strip())
        line_chunk = line_chunk.strip(' ' + word_div)
        if word_div == '] [':
            line_chunk = f'[{line_chunk}]'

        # Post-process the syllabified line only, so that an '@' or a
        # doubled divider inside the echoed original line is left alone.
        if syl_div != '@':
            line_chunk = line_chunk.replace('@', syl_div)
        # fix misc double word_div
        line_chunk = line_chunk.replace(double_word_div, one_word_div)

        origin = line if show_origin else ''
        parts.append(f'{origin}\n{line_chunk}\n')
    return ''.join(parts).strip()


# -------- huggingface demo --------
def hf_demo(text, word_div=my_word_divider, show_origin=my_show_origin,
            syl_div=my_syllable_divider):
    """Callback for the Gradio interface; forwards everything to easy_read()."""
    return easy_read(text, word_div=word_div, show_origin=show_origin,
                     syl_div=syl_div)


if gr is not None:
    # This iface code snippet is based on example code of
    # https://huggingface.co/facebook/m2m100_1.2B
    #
    # Newer gradio releases expose the component as gr.Textbox; older ones
    # only have gr.inputs.Textbox. Use whichever is available.
    _textbox = getattr(gr, 'Textbox', None) or gr.inputs.Textbox
    iface = gr.Interface(
        fn=hf_demo,
        title="Pali Easy Read",
        description="Split Roman pali words into syllables",
        inputs=_textbox(lines=5, placeholder="Enter Pali Text"),
        outputs="text")
    # Launched at module level, exactly as before, so the way the demo is
    # started does not change.
    iface.launch()
elif __name__ == '__main__':
    raise SystemExit('gradio is required to run the demo (pip install gradio)')