|
from functools import partial |
|
from pathlib import Path |
|
|
|
import gradio as gr |
|
from joeynmt.datasets import build_dataset |
|
from joeynmt.helpers import ( |
|
load_checkpoint, |
|
load_config, |
|
parse_train_args, |
|
resolve_ckpt_path, |
|
) |
|
from joeynmt.model import build_model |
|
from joeynmt.prediction import predict |
|
from joeynmt.tokenizers import build_tokenizer |
|
from joeynmt.vocabulary import build_vocab |
|
|
|
# Maps each label offered in the UI dropdown ("<language> in <script>") to the
# name of the matching model directory under ./models.
languages_scripts = {
    "Azeri Turkish in Persian": "AzeriTurkish-Persian",
    "Central Kurdish in Arabic": "Sorani-Arabic",
    "Central Kurdish in Persian": "Sorani-Persian",
    "Gilaki in Persian": "Gilaki-Persian",
    "Gorani in Arabic": "Gorani-Arabic",
    "Gorani in Central Kurdish": "Gorani-Sorani",
    "Gorani in Persian": "Gorani-Persian",
    "Kashmiri in Urdu": "Kashmiri-Urdu",
    "Mazandarani in Persian": "Mazandarani-Persian",
    "Northern Kurdish in Arabic": "Kurmanji-Arabic",
    "Northern Kurdish in Persian": "Kurmanji-Persian",
    "Sindhi in Urdu": "Sindhi-Urdu",
}
|
|
|
|
|
# Inference resources already loaded in this process, keyed by dropdown label.
# Reading the config, building the vocabularies/model and loading the
# checkpoint are identical for every request to a given language, so they are
# done once and reused instead of being repeated on each call.
_loaded_resources = {}


def _load_resources(language_script):
    """Return the inference resources for ``language_script``, loading them once.

    Args:
        language_script: A key of ``languages_scripts``.

    Returns:
        A dict with the keys ``model``, ``tokenizer``, ``sequence_encoder``,
        ``src_lang``, ``trg_lang``, ``test_cfg``, ``device``, ``n_gpu``,
        ``num_workers`` and ``fp16``.
    """
    cached = _loaded_resources.get(language_script)
    if cached is not None:
        return cached

    model_root = Path("./models") / languages_scripts[language_script]
    cfg = load_config(model_root / "config.yaml")

    # The checkpoint is always the best.ckpt stored next to the config, so the
    # ``load_model`` entry returned by parse_train_args is not needed.
    model_dir, _, device, n_gpu, num_workers, _, fp16 = parse_train_args(
        cfg["training"], mode="prediction"
    )
    ckpt = resolve_ckpt_path(model_root / "best.ckpt", model_dir)

    src_cfg = cfg["data"]["src"]
    trg_cfg = cfg["data"]["trg"]

    src_vocab, trg_vocab = build_vocab(cfg["data"], model_dir=model_dir)
    model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)

    model_checkpoint = load_checkpoint(ckpt, device=device)
    model.load_state_dict(model_checkpoint["model_state"])
    if device.type == "cuda":
        model.to(device)

    # Each request decodes exactly one sentence.
    test_cfg = cfg["testing"]
    test_cfg["batch_size"] = 1
    test_cfg["batch_type"] = "sentence"

    resources = {
        "model": model,
        "tokenizer": build_tokenizer(cfg["data"]),
        # Only the source side is encoded; no target is supplied at inference.
        "sequence_encoder": {
            src_cfg["lang"]: partial(src_vocab.sentences_to_ids, bos=False, eos=True),
            trg_cfg["lang"]: None,
        },
        "src_lang": src_cfg["lang"],
        "trg_lang": trg_cfg["lang"],
        "test_cfg": test_cfg,
        "device": device,
        "n_gpu": n_gpu,
        "num_workers": num_workers,
        "fp16": fp16,
    }
    _loaded_resources[language_script] = resources
    return resources


def normalize(text, language_script):
    """Normalize ``text`` with the model selected by ``language_script``.

    Args:
        text: The noisy input text; surrounding whitespace is ignored.
        language_script: A key of ``languages_scripts`` naming the language
            and the unconventional script the text is written in.

    Returns:
        The first hypothesis produced by the model, or ``""`` when ``text`` is
        empty or consists of whitespace only.

    Raises:
        KeyError: If ``language_script`` is not a key of ``languages_scripts``
            (for example ``None``).
    """
    if language_script not in languages_scripts:
        raise KeyError(
            "Unknown language/script %r; expected one of: %s"
            % (language_script, ", ".join(sorted(languages_scripts)))
        )

    sentence = (text or "").strip()
    if not sentence:
        # Nothing to normalize; skip model loading and inference entirely.
        return ""

    resources = _load_resources(language_script)

    # A fresh single-item streaming dataset is built for every request.
    test_data = build_dataset(
        dataset_type="stream",
        path=None,
        src_lang=resources["src_lang"],
        trg_lang=resources["trg_lang"],
        split="test",
        tokenizer=resources["tokenizer"],
        sequence_encoder=resources["sequence_encoder"],
    )
    test_data.set_item(sentence)

    # Of the six values returned by predict, only the hypotheses are used.
    _, _, hypotheses, _, _, _ = predict(
        model=resources["model"],
        data=test_data,
        compute_loss=False,
        device=resources["device"],
        n_gpu=resources["n_gpu"],
        normalization="none",
        num_workers=resources["num_workers"],
        cfg=resources["test_cfg"],
        fp16=resources["fp16"],
    )

    return hypotheses[0]
|
|
|
|
|
# HTML header passed as the interface title: heading, illustration and links.
title = """
<center><strong><font size='8'>Script Normalization for Unconventional Writing</font></strong></center>

<div align="center">
<img src="https://raw.githubusercontent.com/sinaahmadi/ScriptNormalization/b80b8fd9e3b77d0e58443ebd506c42173486f9a6/Perso-Arabic_scripts.jpg" alt="Perso-Arabic scripts used by the target languages in our paper" width="400">
</div>

<h3 style="font-weight: 450; font-size: 1rem; margin: 0rem">
[<a href="https://sinaahmadi.github.io/docs/articles/ahmadi2023acl.pdf" style="color:blue;">Paper (ACL 2023)</a>]
[<a href="https://sinaahmadi.github.io/docs/slides/ahmadi2023acl_slides.pdf" style="color:blue;">Slides</a>]
[<a href="https://github.com/sinaahmadi/ScriptNormalization" style="color:blue;">GitHub</a>]
[<a href="https://s3.amazonaws.com/pf-user-files-01/u-59356/uploads/2023-06-04/rw32pwp/ACL2023.mp4" style="color:blue;">Presentation</a>]
</h3>
"""
|
|
|
# HTML introduction passed as the interface description.
description = """
<ul>
<li style="font-size:120%;">"<em>mar7aba!</em>"</li>
<li style="font-size:120%;">"<em>هاو ئار یوو؟</em>"</li>
<li style="font-size:120%;">"<em>Μπιάνβενου α σετ ντεμό!</em>"</li>
</ul>

<p style="font-size:120%;">What do all these sentences have in common? Being greeted in Arabic with "<em>mar7aba</em>" written in the Latin script, then asked how you are ("<em>هاو ئار یوو؟</em>") in English using the Perso-Arabic script of Kurdish and then, welcomed to this demo in French ("<em>Μπιάνβενου α σετ ντεμό!</em>") written in Greek script. All these sentences are written in an <strong>unconventional</strong> script.</p>

<p style="font-size:120%;">Although you may find these sentences risible, unconventional writing is a common practice among millions of speakers in bilingual communities. In our paper entitled "<a href="https://sinaahmadi.github.io/docs/articles/ahmadi2023acl.pdf" target="_blank"><strong>Script Normalization for Unconventional Writing of Under-Resourced Languages in Bilingual Communities</strong></a>", we shed light on this problem and propose an approach to normalize noisy text written in unconventional writing.</p>

<p style="font-size:120%;">This demo deploys a few models that are trained for <strong>the normalization of unconventional writing</strong>. Please note that this tool is not a spell-checker and cannot correct errors beyond character normalization. For better performance, you can apply hard-coded rules on the input and then pass it to the models, hence a hybrid system.</p>

<p style="font-size:120%;">For more information, you can check out the project on GitHub too: <a href="https://github.com/sinaahmadi/ScriptNormalization" target="_blank"><strong>https://github.com/sinaahmadi/ScriptNormalization</strong></a></p>
"""
|
|
|
# Sample inputs shown under the interface: one [noisy text, dropdown label]
# pair per supported language/script combination.
examples = [
    [
        "بو شهرین نوفوسو ، 2014 نجی ایلين نوفوس ساییمی اساسيندا 41 نفر ایمیش .",
        "Azeri Turkish in Persian",
    ],
    ["ياخوا تةمةن دريژبيت بوئةم ميللةتة", "Central Kurdish in Arabic"],
    ["یکیک له جوانیکانی ام شاره جوانه", "Central Kurdish in Persian"],
    ["نمک درهٰ مردوم گيلک ايسن ؤ اوشان زوان ني گيلکي ايسه .", "Gilaki in Persian"],
    ["شؤنةو اانةيةرة گةشت و گلي ناجارانةو اؤجالاني دةستش پنةكةرد", "Gorani in Arabic"],
    ["ڕوٙو زوانی ئەذایی چەنی پەیذابی ؟", "Gorani in Central Kurdish"],
    ["هنگامکان ظميٛ ر چمان ، بپا کريٛلي بيشان :", "Gorani in Persian"],
    ["ربعی بن افکل اُسے اَکھ صُحابی .", "Kashmiri in Urdu"],
    ["اینتا زون گنشکرون 85 میلیون نفر هسن", "Mazandarani in Persian"],
    ["بة رطكا هة صطئن ژ دل هاطة بة لافكرن", "Northern Kurdish in Arabic"],
    ["ثرکى همرنگ نرميني دويت هندک قوناغين دي ببريت", "Northern Kurdish in Persian"],
    ["ہتی کجھ اپ ۽ تمام دائون ترینون بیھندیون آھن .", "Sindhi in Urdu"],
]
|
|
|
|
|
# HTML footer passed as the interface article (author credit).
article = """
<div style="text-align: justify; max-width: 1200px; margin: 20px auto;">
<h3 style="font-weight: 450; font-size: 1rem; margin: 0rem">
<b>Created and deployed by Sina Ahmadi <a href="https://sinaahmadi.github.io/">(https://sinaahmadi.github.io/)</a>.</b>
</h3>
</div>
"""
|
|
|
# Web UI: a text box and a language dropdown feed `normalize`, whose result is
# shown in a second text box. The top-level `gr.Textbox` component is used for
# both boxes so they match `gr.Dropdown`, instead of the deprecated
# `gr.inputs` / `gr.outputs` namespaces.
demo = gr.Interface(
    title=title,
    description=description,
    fn=normalize,
    inputs=[
        gr.Textbox(lines=4, label="Noisy Text \U0001F974"),
        gr.Dropdown(
            label="Language in unconventional script",
            choices=sorted(languages_scripts),
        ),
    ],
    outputs=gr.Textbox(label="Normalized Text \U0001F642"),
    examples=examples,
    article=article,
    examples_per_page=20,
)

# Start serving the interface.
demo.launch()
|
|