Fix import errors, lint

- app.py +37 -36
- requirements.txt +1 -1
app.py
CHANGED
@@ -1,21 +1,18 @@
-from pathlib import Path
 from functools import partial
+from pathlib import Path
 
-
+import gradio as gr
+from joeynmt.datasets import build_dataset
 from joeynmt.helpers import (
-    check_version,
     load_checkpoint,
     load_config,
     parse_train_args,
     resolve_ckpt_path,
-
 )
 from joeynmt.model import build_model
+from joeynmt.prediction import predict
 from joeynmt.tokenizers import build_tokenizer
 from joeynmt.vocabulary import build_vocab
-from joeynmt.datasets import build_dataset
-
-import gradio as gr
 
 languages_scripts = {
     "Azeri Turkish in Persian": "AzeriTurkish-Persian",
@@ -29,45 +26,44 @@ languages_scripts = {
     "Mazandarani in Persian": "Mazandarani-Persian",
     "Northern Kurdish in Arabic": "Kurmanji-Arabic",
     "Northern Kurdish in Persian": "Kurmanji-Persian",
-    "Sindhi in Urdu": "Sindhi-Urdu"
+    "Sindhi in Urdu": "Sindhi-Urdu",
 }
 
+
 def normalize(text, language_script):
-
-
-
-
+    cfg_file = "./models/%s/config.yaml" % languages_scripts[language_script]
+    ckpt = "./models/%s/best.ckpt" % languages_scripts[language_script]
+
     cfg = load_config(Path(cfg_file))
-
-    model_dir, load_model, device, n_gpu, num_workers, _, fp16 = parse_train_args(
-        cfg["training"], mode="prediction")
+    # parse and validate cfg
+    model_dir, load_model, device, n_gpu, num_workers, _, fp16 = parse_train_args(cfg["training"], mode="prediction")
     test_cfg = cfg["testing"]
     src_cfg = cfg["data"]["src"]
     trg_cfg = cfg["data"]["trg"]
-
+
     load_model = load_model if ckpt is None else Path(ckpt)
     ckpt = resolve_ckpt_path(load_model, model_dir)
-
+
     src_vocab, trg_vocab = build_vocab(cfg["data"], model_dir=model_dir)
-
+
     model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
-
+
     # load model state from disk
     model_checkpoint = load_checkpoint(ckpt, device=device)
     model.load_state_dict(model_checkpoint["model_state"])
-
+
     if device.type == "cuda":
         model.to(device)
-
+
     tokenizer = build_tokenizer(cfg["data"])
     sequence_encoder = {
         src_cfg["lang"]: partial(src_vocab.sentences_to_ids, bos=False, eos=True),
         trg_cfg["lang"]: None,
     }
-
+
     test_cfg["batch_size"] = 1 # CAUTION: this will raise an error if n_gpus > 1
     test_cfg["batch_type"] = "sentence"
-
+
     test_data = build_dataset(
         dataset_type="stream",
         path=None,
@@ -79,7 +75,7 @@ def normalize(text, language_script):
     )
     test_data.set_item(text.strip())
 
-    cfg=test_cfg
+    cfg = test_cfg
     _, _, hypotheses, trg_tokens, trg_scores, _ = predict(
         model=model,
         data=test_data,
@@ -91,8 +87,10 @@ def normalize(text, language_script):
         cfg=cfg,
         fp16=fp16,
     )
+
     return hypotheses[0]
 
+
 title = """
 <center><strong><font size='8'>Script Normalization for Unconventional Writing<font></strong></center>
 
@@ -125,22 +123,25 @@ description = """
 """
 
 examples = [
-    [
+    [
+        "بو شهرین نوفوسو ، 2014 نجی ایلين نوفوس ساییمی اساسيندا 41 نفر ایمیش .",
+        "Azeri Turkish in Persian",
+    ],  # "بۇ شهرین نۆفوسو ، 2014 نجی ایلين نۆفوس ساییمی اساسيندا 41 نفر ایمیش ."
     ["ياخوا تةمةن دريژبيت بوئةم ميللةتة", "Central Kurdish in Arabic"],
     ["یکیک له جوانیکانی ام شاره جوانه", "Central Kurdish in Persian"],
     ["نمک درهٰ مردوم گيلک ايسن ؤ اوشان زوان ني گيلکي ايسه .", "Gilaki in Persian"],
-    ["شؤنةو اانةيةرة گةشت و گلي ناجارانةو اؤجالاني دةستش پنةكةرد", "Gorani in Arabic"],
-    ["ڕوٙو زوانی ئەذایی چەنی پەیذابی ؟", "Gorani in Central Kurdish"],
-    ["هنگامکان ظميٛ ر چمان ، بپا کريٛلي بيشان :", "Gorani in Persian"],
-    ["ربعی بن افکل اُسے اَکھ صُحابی .", "Kashmiri in Urdu"],
-    ["اینتا زون گنشکرون 85 میلیون نفر هسن", "Mazandarani in Persian"],
-    ["بة رطكا هة صطئن ژ دل هاطة بة لافكرن", "Northern Kurdish in Arabic"],
-    ["ثرکى همرنگ نرميني دويت هندک قوناغين دي ببريت", "Northern Kurdish in Persian"],
-    ["ہتی کجھ اپ ۽ تمام دائون ترینون بیھندیون آھن .", "Sindhi in Urdu"]
+    ["شؤنةو اانةيةرة گةشت و گلي ناجارانةو اؤجالاني دةستش پنةكةرد", "Gorani in Arabic"],  # شۆنەو ئانەیەرە گەشت و گێڵی ناچارانەو ئۆجالانی دەستش پنەکەرد
+    ["ڕوٙو زوانی ئەذایی چەنی پەیذابی ؟", "Gorani in Central Kurdish"],  # ڕوٙو زوانی ئەڎایی چەنی پەیڎابی ؟
+    ["هنگامکان ظميٛ ر چمان ، بپا کريٛلي بيشان :", "Gorani in Persian"],  # هەنگامەکان وزمیٛ وەرو چەمان ، بەپاو کریٛڵی بیەشان :
+    ["ربعی بن افکل اُسے اَکھ صُحابی .", "Kashmiri in Urdu"],  # ربعی بن افکل ٲسؠ اَکھ صُحابی .
+    ["اینتا زون گنشکرون 85 میلیون نفر هسن", "Mazandarani in Persian"],  # اینتا زوون گِنِشکَرون 85 میلیون نفر هسنه
+    ["بة رطكا هة صطئن ژ دل هاطة بة لافكرن", "Northern Kurdish in Arabic"],  # پەرتوکا هەستێن ژ دل هاتە بەلافکرن
+    ["ثرکى همرنگ نرميني دويت هندک قوناغين دي ببريت", "Northern Kurdish in Persian"],  # سەرەکی هەمەرەنگ نەرمینێ دڤێت هندەک قوناغێن دی ببڕیت
+    ["ہتی کجھ اپ ۽ تمام دائون ترینون بیھندیون آھن .", "Sindhi in Urdu"],  # هتي ڪجھ اپ ۽ تمام ڊائون ٽرينون بيھنديون آھن .
 ]
 
 
-article =
+article = """
 <div style="text-align: justify; max-width: 1200px; margin: 20px auto;">
 <h3 style="font-weight: 450; font-size: 1rem; margin: 0rem">
 <b>Created and deployed by Sina Ahmadi <a href="https://sinaahmadi.github.io/">(https://sinaahmadi.github.io/)</a>.
@@ -152,14 +153,14 @@ demo = gr.Interface(
     title=title,
     description=description,
     fn=normalize,
-    inputs
+    inputs=[
        gr.inputs.Textbox(lines=4, label="Noisy Text \U0001F974"),
        gr.Dropdown(label="Language in unconventional script", choices=sorted(list(languages_scripts.keys()))),
    ],
    outputs=gr.outputs.Textbox(label="Normalized Text \U0001F642"),
    examples=examples,
    article=article,
-    examples_per_page=20
+    examples_per_page=20,
 )
 
 demo.launch()
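With these fixes, `normalize` is the complete inference path: it loads the language pair's config and checkpoint, rebuilds the vocabularies and model, and runs `predict` on a one-sentence stream dataset. A quick way to sanity-check it without the UI is to call it directly; since `app.py` invokes `demo.launch()` at module level, the simplest place for such a call is inside `app.py` itself, just above the launch line. A throwaway sketch (the sample sentence comes from the demo's own examples list):

```python
# Throwaway smoke test: paste just above demo.launch() in app.py.
# Assumes the Space's models/<pair>/ directories (config.yaml, best.ckpt)
# are present, since normalize() loads them on every call.
sample = "ياخوا تةمةن دريژبيت بوئةم ميللةتة"  # noisy Central Kurdish in Arabic script
print(normalize(sample, "Central Kurdish in Arabic"))
```

Note that `normalize` rebuilds the vocabularies and reloads the checkpoint on every invocation; that keeps the demo stateless and simple, at the cost of per-request latency.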
requirements.txt
CHANGED
@@ -1,2 +1,2 @@
 gradio
-
+joeynmt==2.2.0
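Pinning `joeynmt==2.2.0` is the substantive fix on this side: `app.py` relies on the JoeyNMT 2.x API surface (`parse_train_args`, `build_dataset(dataset_type="stream", ...)`, `predict`), which other releases do not guarantee. `gradio` stays unpinned even though `app.py` uses the legacy `gr.inputs`/`gr.outputs` namespaces, which newer Gradio releases have removed, so a fresh install may still need a compatible Gradio version. A minimal startup guard for the pinned dependency (an illustration, not part of this commit) could look like:

```python
# Illustrative check that the pinned JoeyNMT release is what got installed;
# standard library only, so it runs before any joeynmt import can fail.
from importlib.metadata import version

installed = version("joeynmt")
if installed != "2.2.0":
    raise RuntimeError(f"app.py expects joeynmt==2.2.0, found {installed}")
```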