# Hugging Face Spaces page status: Sleeping
# -*- coding: utf-8 -*-
# Split Roman Pali words into syllables.
# It splits most words correctly, but not all.
# Update: https://github.com/vpnry/palieasyread
# version 0.0.2
import re
import string
from collections import OrderedDict

import gradio as gr  # huggingface demo
# app logic
# v0.0.3
# -------- modify these 3 values to your choice
# Text placed between two words of the syllabified output.
my_word_divider = ' _ '
# Text placed between two syllables of the same word.
my_syllable_divider = ' '
# Whether the original line is shown together with its syllabified version.
my_show_origin = True
# below is the app logic
# Comma-separated lower-case vowels of romanized Pali.
vowel_str = 'a,ā,i,ī,u,ū,e,o'
# Every vowel in lower case followed by the same vowels in upper case.
vowels = vowel_str.split(',')
vowels += vowel_str.upper().split(',')
# asp_consonants = 'ch,jh,kh,gh,th,ṭh,dh,ḍh,bh,ph'.split(',')
# Lower-case letter pairs that must stay together while consonant clusters
# are being split, each mapped to a single placeholder character.
# add_div_consonant() swaps every pair for its placeholder before splitting
# and swaps it back afterwards.
escape_xh = OrderedDict([
    # Myanmar number 1->0
    ('kh', '၁'),
    ('gh', '၂'),
    ('ch', '၃'),
    ('jh', '၄'),
    ('th', '၅'),
    ('ṭh', '၆'),
    ('dh', '၇'),
    ('ḍh', '၈'),
    ('ph', '၉'),
    ('bh', '၀'),
    ('vh', '$'),
    # pariyogāḷhadhammo => pa ri yo gā ḷha dham mo
    ('ḷh', '¢'),
    # gārayhā => gā ra yhā
    ('yh', '£'),
    ('br', '€'),
    ('by', '¥'),
])
# Literal clean-ups applied, in this order, as the last replacement step of
# manual_fix_chunk().  The capitalised entries re-join pairs whose
# lower-case forms are protected by escape_xh; the remaining entries move or
# drop the '@' marker inside specific clusters.
final_manual_fix = OrderedDict([
    ('K@h', 'Kh'),
    ('G@h', 'Gh'),
    ('C@h', 'Ch'),
    ('J@h', 'Jh'),
    ('T@h', 'Th'),
    ('Ṭ@h', 'Ṭh'),
    ('D@h', 'Dh'),
    ('Ḍ@h', 'Ḍh'),
    ('P@h', 'Ph'),
    ('B@h', 'Bh'),
    ('V@h', 'Vh'),
    ('Ḷ@h', 'Ḷh'),
    ('Y@h', 'Yh'),
    ('B@r', 'Br'),
    ('B@y', 'By'),
    # Manually replace
    ('D@v', 'Dv'),
    # khadv
    ('d@v', '@dv'),
    ('t@v', '@tv'),
    ('s@v', '@sv'),
    ('t@r', '@tr'),
])
# Characters a user-chosen divider may not contain: every placeholder used
# by escape_xh plus '@', the internal syllable marker.
not_allow_divs = list(escape_xh.values())
not_allow_divs.append('@')
# Matches a run of one or more non-word characters.
rex_nonWord = re.compile(r'\W+')
# A non-vowel character directly followed by one of the listed h-digraphs
# (like kkh => k-kh).
_CONSONANT_THEN_DIGRAPH = re.compile(
    r'([^aāiīuūeo])(ch|jh|kh|gh|th|ṭh|dh|ḍh|bh|ph)',
    re.IGNORECASE)
# Two adjacent characters, neither of which is a vowel, '.', an ASCII digit
# or '@' (like nn => n-n).
_CONSONANT_PAIR = re.compile(
    r'([^.aāiīuūeo1234567890@])([^.aāiīuūeo1234567890@])',
    re.IGNORECASE)


def add_div_consonant(word):
    """Insert '@' markers inside the consonant clusters of *word*.

    The steps are:

    1. put '@' between a non-vowel character and a following h-digraph;
    2. replace every pair listed in ``escape_xh`` by its one-character
       placeholder so that the pair cannot be split;
    3. put '@' between two adjacent characters matched by the
       consonant-pair pattern;
    4. turn the placeholders back into their pairs.

    For example 'buddho' becomes 'bud@dho' and 'dhammo' becomes 'dham@mo'.

    The matches are collected once per step and applied with ``str.replace``,
    so every occurrence of a matched pair in the word is rewritten.

    NOTE: step 4 rewrites every placeholder character found in the word, so
    a placeholder that was already present in the input (for example '$')
    comes back as the corresponding pair.

    Args:
        word: A single token of the text.

    Returns:
        The token with '@' markers added.  The token is returned untouched
        when nothing is left after stripping '@', ASCII digits, ASCII
        punctuation and whitespace from its ends.
    """
    if not word.strip('@1234567890' + string.punctuation + string.whitespace):
        return word

    for head, digraph in _CONSONANT_THEN_DIGRAPH.findall(word):
        word = word.replace(head + digraph, head + '@' + digraph)

    # Protect the pairs that must not be separated by the next step.
    for pair, placeholder in escape_xh.items():
        word = word.replace(pair, placeholder)

    for first, second in _CONSONANT_PAIR.findall(word):
        word = word.replace(first + second, first + '@' + second)

    # restore escaped ?h
    for pair, placeholder in escape_xh.items():
        word = word.replace(placeholder, pair)
    return word
# A single non-vowel character with a '@' marker on both sides.
_MARKED_BOTH_SIDES = re.compile(r'@([^aāiīuūeo])@', re.IGNORECASE)

# fix misc PTT html
# Literal replacements, applied in this order, that drop a marker placed
# next to punctuation, quotation marks or a niggahita (ṃ / ṁ).
_MARKER_CLEANUPS = (
    ('@,', ','),
    ('@.', '.'),
    ('@;', ';'),
    ('@ṃ', 'ṃ'),
    ('@ṁ', 'ṁ'),
    ('‘@‘', '‘‘'),
    ('’@’', '’’'),
    ('‘@', '‘'),
)


def manual_fix_chunk(word):
    """Tidy the '@' markers of an already marked-up word.

    Args:
        word: A word that contains '@' syllable markers.

    Returns:
        The word after the fixed clean-ups below, without leading or
        trailing '@'.
    """
    # @t@ => t@
    word = _MARKED_BOTH_SIDES.sub(r'\1@', word)
    for old, new in _MARKER_CLEANUPS:
        word = word.replace(old, new)
    for old, new in final_manual_fix.items():
        word = word.replace(old, new)
    return word.strip('@')
def split_syl_word(word):
    """Return *word* with '@' placed between its syllables.

    Args:
        word: A single token of the text.

    Returns:
        The marked-up token.  Tokens of at most two characters are returned
        unchanged.
    """
    if len(word) <= 2:
        return word
    marked = add_div_consonant(word)
    # consider a valid syllable after meeting a vowel
    # it works for most of the words.
    # Every other character (consonant, punctuation, existing '@') is copied
    # as it is.
    chunk = ''.join(ch + '@' if ch in vowels else ch for ch in marked)
    return manual_fix_chunk(chunk.strip('@'))
def check_div_collision(word_div, syl_div):
    """Tell whether a divider contains a character listed in not_allow_divs.

    Args:
        word_div: Divider placed between words.
        syl_div: Divider placed between syllables.

    Returns:
        True when the stripped dividers, taken together, contain at least
        one entry of ``not_allow_divs``; False otherwise.
    """
    divs = word_div.strip() + syl_div.strip()
    return any(reserved in divs for reserved in not_allow_divs)
def easy_read(text, word_div=' _ ', show_origin=True, syl_div=' '):
    """Split every word of *text* into syllables, line by line.

    Args:
        text: The text to process; it may span several lines.
        word_div: Divider written between the words of a line.  When it is
            exactly '] [' the whole syllabified line is also wrapped in
            '[' and ']'.
        show_origin: When true, every input line is written out followed by
            its syllabified version; otherwise an empty line takes the place
            of the input line.
        syl_div: Divider written between the syllables of a word.

    Returns:
        The formatted text with surrounding whitespace removed, or '' (after
        printing an error message) when a divider contains a character
        listed in ``not_allow_divs``.
    """
    if check_div_collision(word_div, syl_div):
        print(
            'Error: word_div or syl_div must not contain these chars\n',
            not_allow_divs)
        print('Please use other dividers.')
        return ''

    bare_div = word_div.strip()
    double_word_div = f' {bare_div} {bare_div} '
    one_word_div = f' {bare_div} '

    pieces = []
    for line in text.strip().splitlines():
        if not line:
            pieces.append('\n')
            continue

        marked_words = (
            split_syl_word(word) for word in line.strip().split(' '))
        # Joining puts the divider only *between* words.  Nothing has to be
        # stripped from the ends afterwards, so characters of the text that
        # happen to occur in word_div are never removed.
        line_chunk = word_div.join(
            marked for marked in marked_words if marked.strip())
        if word_div == '] [':
            line_chunk = f'[{line_chunk}]'

        # The marker and divider clean-ups are limited to the syllabified
        # chunk so that the original line is echoed exactly as given.
        line_chunk = line_chunk.replace('@', syl_div)
        # fix misc double word_div
        line_chunk = line_chunk.replace(double_word_div, one_word_div)

        origin = line if show_origin else ''
        pieces.append(f'{origin}\n{line_chunk}\n')

    return ''.join(pieces).strip()
# -------- huggingface demo --------
def hf_demo(text, word_div=' _ ', show_origin=True, syl_div=' '):
    """Callback of the demo interface: forward all arguments to easy_read().

    Args:
        text: The text to process.
        word_div: Divider written between words.
        show_origin: Whether each original line is shown with its result.
        syl_div: Divider written between syllables.

    Returns:
        Whatever easy_read() returns for the same arguments.
    """
    return easy_read(
        text, word_div=word_div, show_origin=show_origin, syl_div=syl_div)
# Use the top-level Textbox component when this gradio version provides it
# and fall back to the legacy ``gr.inputs`` namespace otherwise.
_textbox_cls = gr.Textbox if hasattr(gr, 'Textbox') else gr.inputs.Textbox

iface = gr.Interface(
    # This iface code snippet is based on example code of
    # https://huggingface.co/facebook/m2m100_1.2B
    fn=hf_demo,
    title="Pali Easy Read",
    description="Split Roman pali words into syllables",
    inputs=_textbox_cls(lines=5, placeholder="Enter Pali Text"),
    outputs="text")
iface.launch()