palieasyread / app.py
dpc's picture
Update app.py
5351643 verified
# -*- coding: utf-8 -*-
# Split Roman pali words into syllables
# It splits correctly for most of the words, but not all.
# Update: https://github.com/vpnry/palieasyread
# version 0.0.2
import string
import re
from collections import OrderedDict
import gradio as gr # huggingface demo
# app logic
# v0.0.3
#-------- modify these 3 values to your choice
my_word_divider = ' _ '
my_syllable_divider = ' '
my_show_origin = True
# below is the app logic
vowel_str = 'a,ā,i,ī,u,ū,e,o'
vowels = vowel_str.split(',')
vowels += vowel_str.upper().split(',')
# asp_consonants = 'ch,jh,kh,gh,th,ṭh,dh,ḍh,bh,ph'.split(',')
escape_xh = OrderedDict([
# Myanmar number 1->0
('kh', '၁'),
('gh', '၂'),
('ch', '၃'),
('jh', '၄'),
('th', '၅'),
('ṭh', '၆'),
('dh', '၇'),
('ḍh', '၈'),
('ph', '၉'),
('bh', '၀'),
('vh', '$'),
# pariyogāḷhadhammo => pa ri yo gā ḷha dham mo
('ḷh', '¢'),
# gārayhā => gā ra yhā
('yh', '£'),
('br', '€'),
('by', '¥')])
final_manual_fix = OrderedDict([
('K@h', 'Kh'),
('G@h', 'Gh'),
('C@h', 'Ch'),
('J@h', 'Jh'),
('T@h', 'Th'),
('Ṭ@h', 'Ṭh'),
('D@h', 'Dh'),
('Ḍ@h', 'Ḍh'),
('P@h', 'Ph'),
('B@h', 'Bh'),
('V@h', 'Vh'),
('Ḷ@h', 'Ḷh'),
('Y@h', 'Yh'),
('B@r', 'Br'),
('B@y', 'By'),
# Manually replace
('D@v', 'Dv'),
# khadv
('d@v', '@dv'),
('t@v', '@tv'),
('s@v', '@sv'),
('t@r', '@tr')
])
not_allow_divs = [v for k, v in escape_xh.items()]
not_allow_divs.append('@')
rex_nonWord = re.compile(r'\W+')
def add_div_consonant(word):
word_ = word.strip('@1234567890' + string.punctuation + string.whitespace)
if not word_:
return word
# like kkh =>k-kh etc
three = re.compile(
r'([^aāiīuūeo])(ch|jh|kh|gh|th|ṭh|dh|ḍh|bh|ph)',
re.IGNORECASE)
three_con = re.findall(three, word)
if three_con:
for tup in three_con:
w = tup[0] + tup[1]
rw = tup[0] + '@' + tup[1]
word = word.replace(w, rw)
for k, v in escape_xh.items():
word = word.replace(k, str(v))
# like nn =>n-n etc
two = re.compile(
r'([^.aāiīuūeo1234567890@])([^.aāiīuūeo1234567890@])',
re.IGNORECASE)
two_con = re.findall(two, word)
if two_con:
for tup in two_con:
w = tup[0] + tup[1]
rw = tup[0] + '@' + tup[1]
word = word.replace(w, rw)
# restore escaped ?h
for k, v in escape_xh.items():
word = word.replace(str(v), k)
return word
def manual_fix_chunk(word):
rex = re.compile(r'@([^aāiīuūeo])@', re.IGNORECASE)
# @t@ => t@
word = re.sub(rex, r'\1@', word)
# fix misc PTT html
word = word.replace('@,', ',')
word = word.replace('@.', '.')
word = word.replace('@;', ';')
word = word.replace('@ṃ', 'ṃ')
word = word.replace('@ṁ', 'ṁ')
word = word.replace('‘@‘', '‘‘')
word = word.replace('’@’', '’’')
word = word.replace('‘@', '‘')
for k, v in final_manual_fix.items():
word = word.replace(k, str(v))
return word.strip('@')
def split_syl_word(word):
if len(word) <= 2:
return word
word = add_div_consonant(word)
chunk = ''
chars = [char for char in word]
lenChar = len(chars)
for i in range(lenChar):
if re.match(rex_nonWord, chars[i]):
chunk += chars[i]
continue
if chars[i] == '@':
chunk += chars[i]
continue
if chars[i] not in vowels:
chunk += chars[i]
# consider a valid syllable after meeting a vowel
# it works for most of the words.
else:
chunk += chars[i] + '@'
chunk = chunk.strip('@')
return manual_fix_chunk(chunk)
def check_div_collision(word_div, syl_div):
divs = word_div.strip() + syl_div.strip()
for i in not_allow_divs:
if i in divs:
return True
return False
def easy_read(text, word_div=' _ ', show_origin=True, syl_div=' '):
error_div = check_div_collision(word_div, syl_div)
if error_div:
print(
'Error: word_div or syl_div must not contain these chars\n',
not_allow_divs)
print('Please use other dividers.')
return ''
res = ''
lines = text.strip().splitlines()
for line in lines:
line_chunk = ''
if not line:
res += '\n'
continue
words = line.strip().split(' ')
for word in words:
syls = split_syl_word(word)
if syls.strip():
line_chunk += syls + word_div
line_chunk = line_chunk.strip(' ' + word_div)
if word_div == '] [':
line_chunk = f'[{line_chunk}]'
if show_origin:
res += f'{line}\n{line_chunk}\n'
else:
res += f'\n{line_chunk}\n'
if syl_div != '@':
res = res.replace('@', syl_div)
# fix misc double word_div
di = word_div.strip()
double_word_div = f' {di} {di} '
one_word_div = f' {di} '
res = res.replace(double_word_div, one_word_div)
return res.strip()
# -------- huggingface demo --------
def hf_demo(text, word_div=' _ ', show_origin=True, syl_div=' '):
res = easy_read(text, word_div=word_div, show_origin=show_origin, syl_div=syl_div)
return res
iface = gr.Interface(
# Thus iface code snippet is based on example code of
# https://huggingface.co/facebook/m2m100_1.2B
fn=hf_demo,
title="Pali Easy Read",
description="Split Roman pali words into syllables",
inputs=gr.inputs.Textbox(lines=5, placeholder="Enter Pali Text"),
outputs="text")
iface.launch()