SinaAhmadi committed on
Commit
4f3ec12
1 Parent(s): e9d8fdb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -66
app.py CHANGED
@@ -17,56 +17,69 @@ from joeynmt.datasets import build_dataset
17
 
18
  import gradio as gr
19
 
20
- # INPUT = "سلاو لە ناو گلی کرد"
21
-
22
- cfg_file = 'config.yaml'
23
- ckpt = './models/Sorani-Arabic/best.ckpt'
24
-
25
- cfg = load_config(Path(cfg_file))
26
- # parse and validate cfg
27
- model_dir, load_model, device, n_gpu, num_workers, _, fp16 = parse_train_args(
28
- cfg["training"], mode="prediction")
29
- test_cfg = cfg["testing"]
30
- src_cfg = cfg["data"]["src"]
31
- trg_cfg = cfg["data"]["trg"]
32
-
33
- load_model = load_model if ckpt is None else Path(ckpt)
34
- ckpt = resolve_ckpt_path(load_model, model_dir)
35
-
36
- src_vocab, trg_vocab = build_vocab(cfg["data"], model_dir=model_dir)
37
-
38
- model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
39
-
40
- # load model state from disk
41
- model_checkpoint = load_checkpoint(ckpt, device=device)
42
- model.load_state_dict(model_checkpoint["model_state"])
43
-
44
- if device.type == "cuda":
45
- model.to(device)
46
-
47
- tokenizer = build_tokenizer(cfg["data"])
48
- sequence_encoder = {
49
- src_cfg["lang"]: partial(src_vocab.sentences_to_ids, bos=False, eos=True),
50
- trg_cfg["lang"]: None,
51
  }
52
 
53
- test_cfg["batch_size"] = 1 # CAUTION: this will raise an error if n_gpus > 1
54
- test_cfg["batch_type"] = "sentence"
55
-
56
- test_data = build_dataset(
57
- dataset_type="stream",
58
- path=None,
59
- src_lang=src_cfg["lang"],
60
- trg_lang=trg_cfg["lang"],
61
- split="test",
62
- tokenizer=tokenizer,
63
- sequence_encoder=sequence_encoder,
64
- )
65
- # test_data.set_item(INPUT.rstrip())
66
-
67
-
68
- def _translate_data(test_data, cfg=test_cfg):
69
  """Translates given dataset, using parameters from outer scope."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  _, _, hypotheses, trg_tokens, trg_scores, _ = predict(
71
  model=model,
72
  data=test_data,
@@ -84,7 +97,7 @@ def _translate_data(test_data, cfg=test_cfg):
84
 
85
  def normalize(text, language_script):
86
  test_data.set_item(text)
87
- result = _translate_data(test_data)
88
  return result
89
 
90
 
@@ -106,35 +119,32 @@ description = """
106
  For more information, you can check out the project on GitHub too: <a href="https://github.com/sinaahmadi/ScriptNormalization" target="_blank"><strong>https://github.com/sinaahmadi/ScriptNormalization</strong></a>
107
  """
108
 
109
- languages_scripts = {
110
- "Azeri Turkish in Persian": "AzeriTurkish-Persian",
111
- "Central Kurdish in Arabic": "Sorani-Arabic",
112
- "Central Kurdish in Persian": "Sorani-Persian",
113
- "Gilaki in Persian": "Gilaki-Persian",
114
- "Gorani in Arabic": "Gorani-Arabic",
115
- "Gorani in Central Kurdish": "Gorani-Sorani",
116
- "Gorani in Persian": "Gorani-Persian",
117
- "Kashmiri in Urdu": "Kashmiri-Urdu",
118
- "Mazandarani in Persian": "Mazandarani-Persian",
119
- "Northern Kurdish in Arabic": "Kurmanji-Arabic",
120
- "Northern Kurdish in Persian": "Kurmanji-Persian",
121
- "Sindhi in Urdu": "Sindhi-Urdu"
122
- }
123
-
124
  examples = [
 
125
  ["ياخوا تةمةن دريژبيت بوئةم ميللةتة", "Central Kurdish in Arabic"],
126
- ["سلاو برا جونی؟", "Central Kurdish in Arabic"],
 
 
 
 
 
 
 
 
 
127
  ]
128
 
 
 
129
  demo = gr.Interface(
130
  title=title,
131
  description=description,
132
  fn=normalize,
133
  inputs = [
134
- gr.inputs.Textbox(lines=4, label="Noisy Text"),
135
  gr.Dropdown(label="Language in unconventional script", choices=sorted(list(languages_scripts.keys()))),
136
  ],
137
- outputs=gr.outputs.Textbox(label="Normalized Text"),
138
  examples=examples
139
  )
140
 
 
17
 
18
  import gradio as gr
19
 
20
+ languages_scripts = {
21
+ "Azeri Turkish in Persian": "AzeriTurkish-Persian",
22
+ "Central Kurdish in Arabic": "Sorani-Arabic",
23
+ "Central Kurdish in Persian": "Sorani-Persian",
24
+ "Gilaki in Persian": "Gilaki-Persian",
25
+ "Gorani in Arabic": "Gorani-Arabic",
26
+ "Gorani in Central Kurdish": "Gorani-Sorani",
27
+ "Gorani in Persian": "Gorani-Persian",
28
+ "Kashmiri in Urdu": "Kashmiri-Urdu",
29
+ "Mazandarani in Persian": "Mazandarani-Persian",
30
+ "Northern Kurdish in Arabic": "Kurmanji-Arabic",
31
+ "Northern Kurdish in Persian": "Kurmanji-Persian",
32
+ "Sindhi in Urdu": "Sindhi-Urdu"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  }
34
 
35
+ def _translate_data(test_data, language_script):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  """Translates given dataset, using parameters from outer scope."""
37
+ cfg_file = './models/%s/config.yaml'
38
+ ckpt = "./models/%s/best.ckpt"%languages_scripts[language_script]
39
+
40
+ cfg = load_config(Path(cfg_file))
41
+ # parse and validate cfg
42
+ model_dir, load_model, device, n_gpu, num_workers, _, fp16 = parse_train_args(
43
+ cfg["training"], mode="prediction")
44
+ test_cfg = cfg["testing"]
45
+ src_cfg = cfg["data"]["src"]
46
+ trg_cfg = cfg["data"]["trg"]
47
+
48
+ load_model = load_model if ckpt is None else Path(ckpt)
49
+ ckpt = resolve_ckpt_path(load_model, model_dir)
50
+
51
+ src_vocab, trg_vocab = build_vocab(cfg["data"], model_dir=model_dir)
52
+
53
+ model = build_model(cfg["model"], src_vocab=src_vocab, trg_vocab=trg_vocab)
54
+
55
+ # load model state from disk
56
+ model_checkpoint = load_checkpoint(ckpt, device=device)
57
+ model.load_state_dict(model_checkpoint["model_state"])
58
+
59
+ if device.type == "cuda":
60
+ model.to(device)
61
+
62
+ tokenizer = build_tokenizer(cfg["data"])
63
+ sequence_encoder = {
64
+ src_cfg["lang"]: partial(src_vocab.sentences_to_ids, bos=False, eos=True),
65
+ trg_cfg["lang"]: None,
66
+ }
67
+
68
+ test_cfg["batch_size"] = 1 # CAUTION: this will raise an error if n_gpus > 1
69
+ test_cfg["batch_type"] = "sentence"
70
+
71
+ test_data = build_dataset(
72
+ dataset_type="stream",
73
+ path=None,
74
+ src_lang=src_cfg["lang"],
75
+ trg_lang=trg_cfg["lang"],
76
+ split="test",
77
+ tokenizer=tokenizer,
78
+ sequence_encoder=sequence_encoder,
79
+ )
80
+ # test_data.set_item(INPUT.rstrip())
81
+
82
+ cfg=test_cfg
83
  _, _, hypotheses, trg_tokens, trg_scores, _ = predict(
84
  model=model,
85
  data=test_data,
 
97
 
98
  def normalize(text, language_script):
99
  test_data.set_item(text)
100
+ result = _translate_data(test_data, language_script)
101
  return result
102
 
103
 
 
119
  For more information, you can check out the project on GitHub too: <a href="https://github.com/sinaahmadi/ScriptNormalization" target="_blank"><strong>https://github.com/sinaahmadi/ScriptNormalization</strong></a>
120
  """
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  examples = [
123
+ ["بو شهرین نوفوسو ، 2014 نجی ایلين نوفوس ساییمی اساسيندا 41 نفر ایمیش .", "Azeri Turkish in Persian"],#"بۇ شهرین نۆفوسو ، 2014 نجی ایلين نۆفوس ساییمی اساسيندا 41 نفر ایمیش ."
124
  ["ياخوا تةمةن دريژبيت بوئةم ميللةتة", "Central Kurdish in Arabic"],
125
+ ["یکیک له جوانیکانی ام شاره جوانه", "Central Kurdish in Persian"],
126
+ ["نمک درهٰ مردوم گيلک ايسن ؤ اوشان زوان ني گيلکي ايسه .", "Gilaki in Persian"],
127
+ ["شؤنةو اانةيةرة گةشت و گلي ناجارانةو اؤجالاني دةستش پنةكةرد", "Gorani in Arabic"], #شۆنەو ئانەیەرە گەشت و گێڵی ناچارانەو ئۆجالانی دەستش پنەکەرد
128
+ ["ڕوٙو زوانی ئەذایی چەنی پەیذابی ؟", "Gorani in Central Kurdish"], # ڕوٙو زوانی ئەڎایی چەنی پەیڎابی ؟
129
+ ["هنگامکان ظميٛ ر چمان ، بپا کريٛلي بيشان :", "Gorani in Persian"], # هەنگامەکان وزمیٛ وەرو چەمان ، بەپاو کریٛڵی بیەشان :
130
+ ["ربعی بن افکل اُسے اَکھ صُحابی .", "Kashmiri in Urdu"], # ربعی بن افکل ٲسؠ اَکھ صُحابی .
131
+ ["اینتا زون گنشکرون 85 میلیون نفر هسن", "Mazandarani in Persian"], # اینتا زوون گِنِشکَرون 85 میلیون نفر هسنه
132
+ ["بة رطكا هة صطئن ژ دل هاطة بة لافكرن", "Northern Kurdish in Arabic"], #پەرتوکا هەستێن ژ دل هاتە بەلافکرن
133
+ ["ثرکى همرنگ نرميني دويت هندک قوناغين دي ببريت", "Northern Kurdish in Persian"], # سەرەکی هەمەرەنگ نەرمینێ دڤێت هندەک قوناغێن دی ببڕیت
134
+ ["ہتی کجھ اپ ۽ تمام دائون ترینون بیھندیون آھن .", "Sindhi in Urdu"] # هتي ڪجھ اپ ۽ تمام ڊائون ٽرينون بيھنديون آھن .
135
  ]
136
 
137
+
138
+
139
  demo = gr.Interface(
140
  title=title,
141
  description=description,
142
  fn=normalize,
143
  inputs = [
144
+ gr.inputs.Textbox(lines=4, label="Noisy Text \U0001F974"),
145
  gr.Dropdown(label="Language in unconventional script", choices=sorted(list(languages_scripts.keys()))),
146
  ],
147
+ outputs=gr.outputs.Textbox(label="Normalized Text \U0001F642"),
148
  examples=examples
149
  )
150