AlzbetaStrompova committed on
Commit
92a6f43
1 Parent(s): 1709ba8

add additional gazetteers

Browse files
Files changed (2) hide show
  1. app.py +20 -38
  2. website_script.py +29 -6
app.py CHANGED
@@ -1,42 +1,20 @@
1
  import json
2
  import gradio as gr
3
- from website_script import load, run, gaz
4
 
5
- tokenizer, model = load()
6
- gazetteers_for_matching = gaz()
7
 
8
  examples = [
9
- "Masarykova univerzita se nachází v Brně .",
10
- "Barack Obama navštívil Prahu minulý týden .",
11
- "Angela Merkelová se setkala s francouzským prezidentem v Paříži .",
12
- "Nobelova cena za fyziku byla udělena týmu vědců z MIT ."
13
  ]
14
 
15
- def add_gazetteers(new_gazetteers):
16
- global gazetteers_for_matching
17
- for key, value_lst in new_gazetteers.items():
18
- key = key.upper()
19
- for dictionary in gazetteers_for_matching:
20
- if key in dictionary.values():
21
- for value in value_lst:
22
- dictionary[value] = key
23
-
24
- def ner(text):
25
- for d in gazetteers_for_matching:
26
- print(len(d))
27
- result = run(tokenizer, model, gazetteers_for_matching, text)
28
- return {"text": text, "entities": result}
29
-
30
- def load_gazetters(file_names):
31
- print(file_names)
32
- # Assuming you have a JSON file named 'data.json'
33
- for file_name in file_names:
34
- with open(file_name, 'r') as file:
35
- data = json.load(file)
36
- gazetteers_for_matching = add_gazetteers(data)
37
-
38
-
39
 
 
 
 
40
 
41
  with gr.Blocks(css="footer{display:none !important}", theme=gr.themes.Default(primary_hue="blue", secondary_hue="sky")) as demo:
42
  # with gr.Blocks(theme=gr.themes.Soft()) as demo:
@@ -47,14 +25,18 @@ with gr.Blocks(css="footer{display:none !important}", theme=gr.themes.Default(pr
47
  examples=examples,
48
  title="NerROB-czech",
49
  description="This is an implementation of a Named Entity Recognition model for the Czech language using gazetteers.",
50
- allow_flagging="never")
51
-
52
- gr.Interface(load_gazetters,
53
- gr.File(label="Upload a JSON file", file_count="multiple", file_types=[".json"]),
54
- None,
55
  allow_flagging="never",
56
- description="Here you can upload your own gazetteers.",
57
- )
 
 
 
 
 
 
 
 
 
58
 
59
  if __name__ == "__main__":
60
  demo.launch()
 
1
  import json
2
  import gradio as gr
3
+ from website_script import load, run
4
 
5
+ tokenizer, model, gazetteers_for_matching = load()
 
6
 
7
  examples = [
8
+ ["Masarykova univerzita se nachází v Brně .", None],
9
+ ["Barack Obama navštívil Prahu minulý týden .", None],
10
+ ["Angela Merkelová se setkala s francouzským prezidentem v Paříži .", None],
11
+ ["Nobelova cena za fyziku byla udělena týmu vědců z MIT .", None]
12
  ]
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
def ner(text, file_names):
    """Run the NER pipeline on *text* and format the result for gr.HighlightedText.

    Parameters:
        text: input sentence to tag.
        file_names: optional list of uploaded JSON gazetteer file paths,
            forwarded to ``run`` so per-request gazetteers can be merged in.

    Returns:
        A dict with the original ``text`` and the detected ``entities``,
        the structure Gradio's highlighted-text output expects.
    """
    return {
        "text": text,
        "entities": run(tokenizer, model, gazetteers_for_matching, text, file_names),
    }
18
 
19
  with gr.Blocks(css="footer{display:none !important}", theme=gr.themes.Default(primary_hue="blue", secondary_hue="sky")) as demo:
20
  # with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
25
  examples=examples,
26
  title="NerROB-czech",
27
  description="This is an implementation of a Named Entity Recognition model for the Czech language using gazetteers.",
 
 
 
 
 
28
  allow_flagging="never",
29
+ additional_inputs=gr.File(label="Upload a JSON file", file_count="multiple", file_types=[".json"]),
30
+ additional_inputs_accordion=gr.Accordion(label="Additional Gazetters", open=False)
31
+ )
32
+
33
+ # gr.Interface(load_gazetters,
34
+ # gr.File(label="Upload a JSON file", file_count="multiple", file_types=[".json"]),
35
+ # None,
36
+ # allow_flagging="never",
37
+ # description="Here you can upload your own gazetteers.", )
38
+ # btn = gr.Button(value="Change Language")
39
+ # btn.click(reload)
40
 
41
  if __name__ == "__main__":
42
  demo.launch()
website_script.py CHANGED
@@ -1,3 +1,6 @@
 
 
 
1
  import torch
2
  from transformers import AutoTokenizer
3
 
@@ -9,27 +12,47 @@ from data_manipulation.preprocess_gazetteers import build_reverse_dictionary
9
  def load():
10
  model_name = "ufal/robeczech-base"
11
  model_path = "bettystr/NerRoB-czech"
 
12
 
13
  model = ExtendedEmbeddigsRobertaForTokenClassification.from_pretrained(model_path).to("cpu")
14
  tokenizer = AutoTokenizer.from_pretrained(model_name)
15
  model.eval()
16
- return tokenizer, model
17
-
18
- def gaz():
19
- gazetteers_path = "gazz2.json"
20
 
21
  gazetteers_for_matching = load_gazetteers(gazetteers_path)
22
  temp = []
23
  for i in gazetteers_for_matching.keys():
24
  temp.append(build_reverse_dictionary({i: gazetteers_for_matching[i]}))
25
  gazetteers_for_matching = temp
26
- return gazetteers_for_matching
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
- def run(tokenizer, model, gazetteers_for_matching, text):
 
29
 
30
  tokenized_inputs = tokenizer(
31
  text, truncation=True, is_split_into_words=False, return_offsets_mapping=True
32
  )
 
 
33
  matches = gazetteer_matching(text, gazetteers_for_matching)
34
  new_g = []
35
  word_ids = tokenized_inputs.word_ids()
 
1
+ import json
2
+ import copy
3
+
4
  import torch
5
  from transformers import AutoTokenizer
6
 
 
12
def load():
    """Load the fine-tuned NER model, its tokenizer, and the base gazetteers.

    Returns:
        (tokenizer, model, gazetteers_for_matching) — the model is placed on
        CPU and switched to eval mode; the gazetteers are a list of reverse
        dictionaries, one per entity label (surface form -> label).
    """
    model_name = "ufal/robeczech-base"
    model_path = "bettystr/NerRoB-czech"
    gazetteers_path = "gazz2.json"

    model = ExtendedEmbeddigsRobertaForTokenClassification.from_pretrained(model_path).to("cpu")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model.eval()

    # Build one reverse dictionary per label so matching maps value -> LABEL.
    raw_gazetteers = load_gazetteers(gazetteers_path)
    gazetteers_for_matching = [
        build_reverse_dictionary({label: raw_gazetteers[label]})
        for label in raw_gazetteers
    ]
    return tokenizer, model, gazetteers_for_matching
27
+
28
+
29
def add_additional_gazetteers(gazetteers_for_matching, file_names):
    """Merge user-uploaded JSON gazetteer files into a copy of the base gazetteers.

    Each uploaded file maps an entity label (e.g. ``"loc"``) to a list of
    surface forms.  Every surface form is inserted into the reverse dictionary
    whose values already contain the upper-cased label, keeping the
    value -> LABEL lookup structure intact.

    Parameters:
        gazetteers_for_matching: list of reverse dictionaries
            (surface form -> entity label).
        file_names: list of JSON file paths, or None/empty when the user
            uploaded nothing.

    Returns:
        The original list object when there is nothing to merge; otherwise a
        deep copy with the new entries added, so the shared base gazetteers
        are never mutated by a single request.
    """
    if not file_names:
        # Nothing uploaded: hand back the shared base gazetteers untouched.
        return gazetteers_for_matching
    # Deep-copy once so per-request additions never leak into global state.
    merged = copy.deepcopy(gazetteers_for_matching)
    for file_name in file_names:
        # Explicit UTF-8: gazetteer entries are Czech text and must not
        # depend on the platform's default encoding.
        with open(file_name, "r", encoding="utf-8") as file:
            data = json.load(file)
        for key, value_lst in data.items():
            key = key.upper()
            for dictionary in merged:
                # Only extend the dictionary that already holds this label.
                if key in dictionary.values():
                    for value in value_lst:
                        dictionary[value] = key
    return merged
46
+
47
 
48
+ def run(tokenizer, model, gazetteers, text, file_names=None):
49
+ gazetteers_for_matching = add_additional_gazetteers(gazetteers, file_names)
50
 
51
  tokenized_inputs = tokenizer(
52
  text, truncation=True, is_split_into_words=False, return_offsets_mapping=True
53
  )
54
+ for i in gazetteers_for_matching:
55
+ print(len(i))
56
  matches = gazetteer_matching(text, gazetteers_for_matching)
57
  new_g = []
58
  word_ids = tokenized_inputs.word_ids()