|
from typing import List, Union |
|
from pythainlp.tokenize import subword_tokenize,word_tokenize |
|
from pythainlp.util import sound_syllable |
|
from pythainlp.util import remove_tonemark |
|
from pythainlp.khavee import KhaveeVerifier |
|
import pythainlp as pythai |
|
from pythainlp.tokenize import word_tokenize |
|
from pythainlp.tokenize import subword_tokenize |
|
from pythainlp.util import sound_syllable |
|
from pythainlp.util import isthai |
|
from pythainlp.transliterate import pronunciate |
|
from pythainlp.spell import correct |
|
from tqdm import tqdm |
|
import numpy as np |
|
import pandas as pd |
|
# Shared KhaveeVerifier instance; the functions below use it for
# is_sumpus() and check_aek_too().
kv = KhaveeVerifier()

from transformers import AutoTokenizer, TFAutoModelForCausalLM

# The tokenizer and the model must come from the same checkpoint, so the
# identifier is written once instead of being repeated in both calls.
_MODEL_ID = "Thanravee/KarveeSaimai"

tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID, local_files_only=False)

# NOTE(review): this loads the TensorFlow model class, while
# gen_prob_next_token() requests PyTorch tensors (return_tensors="pt") and
# applies torch.nn.functional.softmax to the logits -- confirm which
# framework is intended.
model = TFAutoModelForCausalLM.from_pretrained(_MODEL_ID, local_files_only=False)
|
|
|
|
|
def split_klong(klong_text):
    """Split a '-'-separated klong string into a list of cleaned waks.

    Segments that are empty or contain only whitespace are dropped.  Of the
    remaining segments, those at positions 1, 3 and 5 (0-based) keep only
    their first whitespace-delimited chunk; anything after it in the same
    segment is discarded.  Every other segment has its space characters
    removed.

    Args:
        klong_text: Raw klong text with waks separated by '-'.

    Returns:
        A list of wak strings in input order.
    """
    waks = [segment for segment in klong_text.split('-') if segment.strip()]
    splitted_klong = []
    for index, wak in enumerate(waks):
        if index in (1, 3, 5):
            # split() with no argument skips leading and repeated whitespace,
            # so "  ab cd" yields "ab".  Splitting on a single ' ' after
            # dropping just one leading space returned '' for such input.
            # The segment is known to be non-blank, so [0] always exists.
            splitted_klong.append(wak.split()[0])
        else:
            splitted_klong.append(wak.replace(' ', ''))
    return splitted_klong
|
|
|
|
|
def subword_token(wak, engine='ssg'):
    """Tokenize a wak into subword units, with a 'dict' engine fallback.

    The wak is first tokenized with ``engine``.  If that does not produce
    exactly 2 or 5 tokens, the wak is tokenized again with the 'dict'
    engine and that result is returned instead.  Note that any other length
    (including 4) triggers the fallback.

    Args:
        wak: Text of a single wak.
        engine: Name of the first subword_tokenize engine to try.  It was
            previously accepted but ignored ('ssg' was hard-coded); the
            default keeps existing calls unchanged.

    Returns:
        The list of subword tokens.
    """
    subword_tokenized = subword_tokenize(wak, engine=engine)
    if len(subword_tokenized) not in (2, 5):
        subword_tokenized = subword_tokenize(wak, engine='dict')
    return subword_tokenized
|
|
|
|
|
|
|
def subword_num(splitted_klong):
    """Check that each wak has the expected number of subword tokens.

    Positions 0, 2, 4 and 6 must tokenize to 5 units, positions 1, 3 and 5
    to 2 units, and position 7 to 4 units.  Positions beyond 7 produce no
    entry in the result.

    Args:
        splitted_klong: List of wak strings.

    Returns:
        A list of booleans, one per checked wak, in order.
    """
    # Expected token count for each wak position (0-based).
    expected_counts = {0: 5, 1: 2, 2: 5, 3: 2, 4: 5, 5: 2, 6: 5, 7: 4}
    return [
        len(subword_token(wak)) == expected_counts[position]
        for position, wak in enumerate(splitted_klong)
        if position in expected_counts
    ]
|
|
|
|
|
def find_tone(word):
    """Classify a syllable for the eak/tou checks.

    Args:
        word: The syllable text.

    Returns:
        "eak or dead" if the text contains the first tone mark tested or
        sound_syllable() reports 'dead'; otherwise "tou" if it contains the
        second tone mark tested; otherwise False.
    """
    # sound_syllable() is only consulted when the first mark is absent.
    if "่" in word or sound_syllable(word) == 'dead':
        return "eak or dead"
    if "้" in word:
        return "tou"
    return False
|
|
|
|
|
def check_eaktou(splitted_klong):
    """Check the required tone classes at fixed syllable slots of each wak.

    Each wak is tokenized with subword_token() and the syllables at the
    slots listed for its position are classified with find_tone().  The wak
    passes when every listed slot has the required class.  Position 1 has
    no requirement and always passes; positions beyond 7 add no entry.

    Args:
        splitted_klong: List of wak strings.

    Returns:
        A list of booleans, one per checked wak, in order.
    """
    eak = "eak or dead"
    tou = 'tou'
    # wak position -> ((syllable index, required find_tone() result), ...)
    required = {
        0: ((3, eak), (4, tou)),
        1: (),
        2: ((1, eak),),
        3: ((0, eak), (1, tou)),
        4: ((2, eak),),
        5: ((1, eak),),
        6: ((1, eak), (4, tou)),
        7: ((0, eak), (1, tou)),
    }
    checked = []
    for position, wak in enumerate(splitted_klong):
        syllables = subword_token(wak)
        if position not in required:
            continue
        # all() stops at the first failing slot, like a chained `and`.
        checked.append(
            all(find_tone(syllables[slot]) == tone
                for slot, tone in required[position])
        )
    return checked
|
|
|
|
|
|
|
def sound_words(splitted_klong):
    """Return the pronounced final syllable of each wak.

    For every wak, the text before the first space is word-tokenized with
    the "newmm" engine, the last word is passed to pronunciate() with the
    "w2p" engine, one specific character is stripped from the result, and
    the part after the last '-' is kept.

    Args:
        splitted_klong: List of wak strings.

    Returns:
        A list with one pronunciation string per wak.
    """
    sound_list = []
    for wak in splitted_klong:
        # Splitting on ' ' and taking the head is a no-op when the wak
        # contains no space.
        head = wak.split(" ")[0]
        last_word = word_tokenize(head, engine="newmm")[-1]
        pronunciation = pronunciate(last_word, engine="w2p")
        sound_list.append(pronunciation.replace('ฺ', '').split('-')[-1])
    return sound_list
|
|
|
|
|
|
|
def check_sampas(sound_list):
    """Check the rhyme links between wak-final sounds.

    The pairs (1, 2), (1, 4) and (3, 6) are tested with kv.is_sumpus(), each
    only when the list is long enough to contain the second index.  When the
    list has six entries or fewer, a trailing True is appended.

    Args:
        sound_list: Pronounced final syllables, one per wak.

    Returns:
        A list of the rhyme check results (plus the trailing True, if any).
    """
    size = len(sound_list)
    # (first index, second index) of every rhyme pair, in check order.
    rhyme_pairs = ((1, 2), (1, 4), (3, 6))
    checked = [
        kv.is_sumpus(sound_list[first], sound_list[second])
        for first, second in rhyme_pairs
        if size > second
    ]
    if size <= 6:
        checked.append(True)
    return checked
|
|
|
def main_check(klong_text):
    """Validate a klong text: syllable counts, then tones, then rhymes.

    The checks run in that order and the first failure is reported.

    Args:
        klong_text: Raw klong text with waks separated by '-'.

    Returns:
        True when every check passes.  Otherwise a tuple of an error label
        and a location: the 1-based wak number for syllable and eaktou
        errors, or a string naming the wak pair for sampas errors.
    """
    splitted_klong = split_klong(klong_text)

    syllable_ok = subword_num(splitted_klong)
    if False in syllable_ok:
        return 'syllable format error', syllable_ok.index(False) + 1

    tone_ok = check_eaktou(splitted_klong)
    if False in tone_ok:
        return 'eaktou format error', tone_ok.index(False) + 1

    rhyme_ok = check_sampas(sound_words(splitted_klong))
    if False in rhyme_ok:
        wak_sampas = ['2 and 3', '2 and 5', '4 and 7']
        return 'sampas format error', wak_sampas[rhyme_ok.index(False)]

    return True
|
|
|
|
|
def gen_prob_next_token(text:str, model, tokenizer):
    """Rank the tokenizer vocabulary by next-token probability after `text`.

    The text is tokenized, run through the model, and the logits of the
    last position are turned into probabilities with softmax.  Those are
    attached to a table of the vocabulary ordered by token id, the table is
    sorted by probability (highest first), and rows whose token fails
    isthai() are dropped.

    Args:
        text: Prompt text.
        model: Causal language model; called on the input ids and expected
            to return an object with a ``logits`` attribute.
        tokenizer: Matching tokenizer exposing ``vocab``.

    Returns:
        A pandas DataFrame with columns 'index', 'token', 'token_id' and
        'prob', sorted by 'prob' in descending order.
    """
    # Kept function-local: torch is not imported at module level.
    import torch.nn.functional as F

    # NOTE(review): return_tensors="pt" and F.softmax assume a PyTorch
    # model, but the module-level `model` is created with
    # TFAutoModelForCausalLM -- confirm which framework is intended.
    encoded = tokenizer(text, return_tensors="pt")
    logits = model(encoded['input_ids']).logits

    # Probabilities for the position following the last input token.
    probs = F.softmax(logits[:, -1, :], dim=-1).squeeze()

    vocab_table = (
        pd.DataFrame(tokenizer.vocab.items(), columns=['token', 'token_id'])
        .sort_values('token_id')
        .reset_index(drop=True)
    )
    # Assumes the vocabulary has exactly one entry per logit and that token
    # ids run 0..n-1, so row i lines up with probs[i] -- TODO confirm.
    vocab_table['prob'] = probs.detach().numpy()

    possible_token = vocab_table.sort_values('prob', ascending=False).reset_index()
    # Blank out non-Thai tokens so dropna() removes their rows.
    possible_token['token'] = [
        x if isthai(x) else None for x in possible_token['token']
    ]
    return possible_token.dropna()
|
|
|
|
|
|
|
def gen_rules(probs, fast_gen=True):
    """Filter candidate tokens down to usable single-syllable words.

    A candidate is accepted when check_word() returns more than one
    character for it, subword_token() yields exactly one unit, and its
    pronunciate() output contains no '-'.  Accepted candidates are passed
    through correct() before being collected.

    Args:
        probs: Iterable of candidate token strings, best first.
        fast_gen: When true, collection stops once more than 5 words have
            been accepted (so at most 6 are returned); otherwise the cap is
            effectively unlimited.

    Returns:
        The list of accepted (and corrected) words, in input order.
    """
    passed = []
    limiter = 5 if fast_gen else 100000000
    for prob in probs:
        if len(passed) > limiter:
            # Nothing more can be accepted, so stop here instead of running
            # the expensive checks on every remaining candidate.
            break
        if (
            len(check_word(prob)) > 1
            and len(subword_token(prob)) == 1
            and '-' not in pronunciate(prob)
        ):
            passed.append(correct(prob))
    return passed
|
|
|
|
|
def check_word(word):
    """Return the characters of `word` that count towards its length.

    If the word contains the last mark of the list below, the result is
    every character except the final two (no other filtering is applied).
    Otherwise the result is the word's characters with the listed marks
    removed.

    Args:
        word: The word to inspect.

    Returns:
        A list of single-character strings.
    """
    chars = list(word)
    if '์' in chars:
        return chars[:-2]
    return [ch for ch in chars if ch not in ('่', '้', '๊', '๋', '์')]
|
|
|
def generator(klong):
    """Return filtered next-word candidates for the given klong text.

    Uses the module-level model and tokenizer to rank tokens, then filters
    them with gen_rules() using its default settings.

    Args:
        klong: The klong text generated so far.

    Returns:
        The list of candidate words produced by gen_rules().
    """
    token_table = gen_prob_next_token(klong, model, tokenizer)
    return gen_rules(token_table['token'].tolist())
|
|
|
|
|
def get_sampassed(data:list, sampaswith):
    """Keep the candidates whose final syllable rhymes with `sampaswith`.

    Both the candidate and the target are reduced to the last '-'-separated
    part of their pronunciate() output and compared with kv.is_sumpus().

    Args:
        data: Iterable of candidate words.
        sampaswith: The word or sound the candidates must rhyme with.

    Returns:
        The list of candidates that rhyme, in input order.  The list may be
        empty.
    """
    # Computed once.  Reassigning `sampaswith` inside the loop pronounced
    # the previous iteration's output again on every pass.
    target_sound = pronunciate(sampaswith).split('-')[-1]

    passed = []
    for possible_word in tqdm(data):
        possible_sampas = pronunciate(possible_word).split('-')[-1]
        try:
            if kv.is_sumpus(possible_sampas, target_sound):
                passed.append(possible_word)
        except IndexError:
            # The verifier could not analyse this pair; skip the candidate.
            continue
    # No trailing `assert len(passed) != counter_exception`: it raised on a
    # legitimately empty result (0 matches, 0 errors) and whenever the two
    # unrelated counts happened to be equal.
    return passed
|
|
|
|
|
def get_aek_too(data:list, ktype='aek'):
    """Keep the words that kv.check_aek_too() classifies as `ktype`.

    Args:
        data: Iterable of candidate words.
        ktype: The classification to keep.

    Returns:
        The list of matching words, in input order.
    """
    return [word for word in tqdm(data) if kv.check_aek_too(word) == ktype]
|
|
|
def tone_gen(klong_text, gened_word, word_mark='no', sampas=False):
    """Pick the next word for the klong, honouring tone and rhyme needs.

    Candidates come from the language model.  Without `sampas`, they are
    the gen_rules()-filtered words, optionally restricted to the 'aek' or
    'too' class.  With `sampas`, they are the 500 most probable tokens that
    rhyme with an earlier wak: wak index 1 for word_mark 'no', wak index 3
    for word_mark 'too' (which additionally requires the 'too' class).  The
    first candidate not already in `gened_word` is chosen.

    Args:
        klong_text: The klong text generated so far.
        gened_word: Words already used; the chosen word is appended in place.
        word_mark: 'no', 'aek' or 'too'.
        sampas: Whether the word must rhyme with an earlier wak.

    Returns:
        A tuple of the chosen word and `gened_word`.

    Raises:
        ValueError: If the word_mark/sampas combination is not supported.
        RuntimeError: If no unused candidate satisfies the constraints.
            (These cases used to fall through and return None, which made
            callers fail while unpacking the result.)
    """
    require_too = False
    if sampas:
        # Index of the wak whose final sound must be rhymed with.
        rhyme_wak = {'no': 1, 'too': 3}.get(word_mark)
        if rhyme_wak is None:
            raise ValueError(
                f"unsupported word_mark {word_mark!r} with sampas=True"
            )
        splitted_klong = split_klong(klong_text)
        probs = gen_prob_next_token(klong_text, model, tokenizer)
        probs = probs['token'][:500]
        candidates = get_sampassed(probs, sound_words(splitted_klong)[rhyme_wak])
        require_too = word_mark == 'too'
    elif word_mark == 'no':
        candidates = generator(klong_text)
    elif word_mark in ('aek', 'too'):
        candidates = get_aek_too(generator(klong_text), word_mark)
    else:
        raise ValueError(f"unsupported word_mark {word_mark!r}")

    for word in candidates:
        if word in gened_word:
            continue
        if require_too and kv.check_aek_too(word) != 'too':
            continue
        gened_word.append(word)
        return word, gened_word

    raise RuntimeError(
        f"no unused candidate word found (word_mark={word_mark!r}, "
        f"sampas={sampas!r})"
    )
|
|
|
# Words to generate for the next wak, keyed by how many waks already exist.
# Each entry is a (word_mark, sampas) pair passed to tone_gen(), in order.
_WAK_PLANS = {
    1: (('no', False), ('no', False)),
    2: (('no', False), ('aek', False), ('no', False), ('no', False), ('no', True)),
    3: (('aek', False), ('too', False)),
    4: (('no', False), ('no', False), ('aek', False), ('no', False), ('no', True)),
    5: (('no', False), ('aek', False)),
    6: (('no', False), ('aek', False), ('no', False), ('no', False), ('too', True)),
    7: (('aek', False), ('too', False), ('no', False), ('no', False)),
}


def gen_klong(klong_text_input, gened_word):
    """Generate the next wak and append it to the klong text.

    The number of waks already present (as counted by split_klong) selects
    which words to generate.  Each generated word is appended directly to
    the text, followed by '-' (or by a newline after the wak generated when
    seven waks exist).  If the count is 0 or greater than 7, the text is
    returned unchanged.

    Note that no separator is inserted before the first generated word, so
    the input is presumably expected to end with '-' already.

    Args:
        klong_text_input: The klong text so far.
        gened_word: Words already used; passed through to tone_gen().

    Returns:
        A tuple of the extended klong text and the used-word list.
    """
    wak_count = len(split_klong(klong_text_input))
    plan = _WAK_PLANS.get(wak_count)
    if plan is None:
        return klong_text_input, gened_word

    klong_text = klong_text_input
    for word_mark, sampas in plan:
        prob, gened_word = tone_gen(
            klong_text, gened_word, word_mark=word_mark, sampas=sampas
        )
        klong_text = klong_text + prob

    klong_text = klong_text + ('\n' if wak_count == 7 else '-')
    return klong_text, gened_word
|
|
|
|
|
def main(klong_text):
    """Validate the klong text and, if valid, extend it by one wak.

    Args:
        klong_text: Raw klong text with waks separated by '-'.

    Returns:
        The extended klong text when validation passes; otherwise the
        error tuple returned by main_check().
    """
    # Validate once and reuse the result: main_check() tokenizes and
    # pronounces every wak, so a second call on failure repeated all of it.
    check_result = main_check(klong_text)
    if check_result is not True:
        return check_result

    generated_text, _ = gen_klong(klong_text, [])
    return generated_text
|
|
|
import gradio as gr

# Text-in / text-out web UI around main().
iface = gr.Interface(
    fn=main,
    inputs="text",
    outputs="text",
)

# Runs at import time.  NOTE(review): share=True presumably requests a
# public share link -- confirm that this is intended for deployment.
iface.launch(
    share=True,
    debug=True,
)