AlzbetaStrompova committed • Commit 92a6f43 • Parent(s): 1709ba8

add additional gazetteers

Browse files:
- app.py +20 -38
- website_script.py +29 -6
app.py
CHANGED
@@ -1,42 +1,20 @@
 import json
 import gradio as gr
-from website_script import load, run
+from website_script import load, run
 
-tokenizer, model = load()
-gazetteers_for_matching = gaz()
+tokenizer, model, gazetteers_for_matching = load()
 
 examples = [
-    "Masarykova univerzita se nachází v Brně .",
-    "Barack Obama navštívil Prahu minulý týden .",
-    "Angela Merkelová se setkala s francouzským prezidentem v Paříži .",
-    "Nobelova cena za fyziku byla udělena týmu vědců z MIT ."
+    ["Masarykova univerzita se nachází v Brně .", None],
+    ["Barack Obama navštívil Prahu minulý týden .", None],
+    ["Angela Merkelová se setkala s francouzským prezidentem v Paříži .", None],
+    ["Nobelova cena za fyziku byla udělena týmu vědců z MIT .", None]
 ]
 
-def add_gazetteers(new_gazetteers):
-    global gazetteers_for_matching
-    for key, value_lst in new_gazetteers.items():
-        key = key.upper()
-        for dictionary in gazetteers_for_matching:
-            if key in dictionary.values():
-                for value in value_lst:
-                    dictionary[value] = key
-
-def ner(text):
-    for d in gazetteers_for_matching:
-        print(len(d))
-    result = run(tokenizer, model, gazetteers_for_matching, text)
-    return {"text": text, "entities": result}
-
-def load_gazetters(file_names):
-    print(file_names)
-    # Assuming you have a JSON file named 'data.json'
-    for file_name in file_names:
-        with open(file_name, 'r') as file:
-            data = json.load(file)
-            gazetteers_for_matching = add_gazetteers(data)
-
-
 
+def ner(text, file_names):
+    result = run(tokenizer, model, gazetteers_for_matching, text, file_names)
+    return {"text": text, "entities": result}
 
 with gr.Blocks(css="footer{display:none !important}", theme=gr.themes.Default(primary_hue="blue", secondary_hue="sky")) as demo:
 # with gr.Blocks(theme=gr.themes.Soft()) as demo:
@@ -47,14 +25,18 @@ with gr.Blocks(css="footer{display:none !important}", theme=gr.themes.Default(pr
         examples=examples,
         title="NerROB-czech",
         description="This is an implementation of a Named Entity Recognition model for the Czech language using gazetteers.",
-        allow_flagging="never")
-
-    gr.Interface(load_gazetters,
-        gr.File(label="Upload a JSON file", file_count="multiple", file_types=[".json"]),
-        None,
         allow_flagging="never",
-
-    )
+        additional_inputs=gr.File(label="Upload a JSON file", file_count="multiple", file_types=[".json"]),
+        additional_inputs_accordion=gr.Accordion(label="Additional Gazetters", open=False)
+    )
+
+    # gr.Interface(load_gazetters,
+    #     gr.File(label="Upload a JSON file", file_count="multiple", file_types=[".json"]),
+    #     None,
+    #     allow_flagging="never",
+    #     description="Here you can upload your own gazetteers.", )
+    # btn = gr.Button(value="Change Language")
+    # btn.click(reload)
 
 if __name__ == "__main__":
     demo.launch()
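Note on the upload format: the gr.File input added above hands a list of JSON paths through ner to run, and each example row becomes ["text", None] because the interface now has a second (file) input. On the merge side, add_additional_gazetteers (see the website_script.py diff below) uppercases each top-level key and copies the listed surface forms into whichever reverse dictionary already carries that label. A minimal sketch of a compatible upload, assuming PER and LOC are among the labels of the prebuilt gazetteers (the names below are made up for illustration):

{
    "per": ["Jan Novák", "Marie Svobodová"],
    "loc": ["Brno", "Ostrava"]
}

Keys are case-insensitive ("loc" becomes "LOC"); each value is an exact surface form to match, so multi-word names stay as single strings. A key that matches no label in the existing reverse dictionaries is silently ignored.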
website_script.py
CHANGED
@@ -1,3 +1,6 @@
+import json
+import copy
+
 import torch
 from transformers import AutoTokenizer
 
@@ -9,27 +12,47 @@ from data_manipulation.preprocess_gazetteers import build_reverse_dictionary
 def load():
     model_name = "ufal/robeczech-base"
     model_path = "bettystr/NerRoB-czech"
+    gazetteers_path = "gazz2.json"
 
     model = ExtendedEmbeddigsRobertaForTokenClassification.from_pretrained(model_path).to("cpu")
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model.eval()
-    return tokenizer, model
-
-def gaz():
-    gazetteers_path = "gazz2.json"
 
     gazetteers_for_matching = load_gazetteers(gazetteers_path)
     temp = []
     for i in gazetteers_for_matching.keys():
         temp.append(build_reverse_dictionary({i: gazetteers_for_matching[i]}))
     gazetteers_for_matching = temp
-    return gazetteers_for_matching
+    return tokenizer, model, gazetteers_for_matching
+
+
+def add_additional_gazetteers(gazetteers_for_matching, file_names):
+    if file_names is None or file_names == []:
+        return gazetteers_for_matching
+    temp = []
+    for l1 in gazetteers_for_matching:
+        d2 = copy.deepcopy(l1)
+        temp.append(d2)
+    for file_name in file_names:
+        with open(file_name, 'r') as file:
+            data = json.load(file)
+            for key, value_lst in data.items():
+                key = key.upper()
+                for dictionary in temp:
+                    if key in dictionary.values():
+                        for value in value_lst:
+                            dictionary[value] = key
+    return temp
+
 
-def run(tokenizer, model, gazetteers_for_matching, text):
+def run(tokenizer, model, gazetteers, text, file_names=None):
+    gazetteers_for_matching = add_additional_gazetteers(gazetteers, file_names)
 
     tokenized_inputs = tokenizer(
         text, truncation=True, is_split_into_words=False, return_offsets_mapping=True
     )
+    for i in gazetteers_for_matching:
+        print(len(i))
     matches = gazetteer_matching(text, gazetteers_for_matching)
     new_g = []
     word_ids = tokenized_inputs.word_ids()
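A minimal sketch of the merge behavior, assuming add_additional_gazetteers from this commit is importable (inside the Space it is, though the module also pulls in torch and transformers at import time). The two base dictionaries are hypothetical stand-ins for what build_reverse_dictionary produces, i.e. surface form -> label:

import json
import tempfile

from website_script import add_additional_gazetteers

# Hypothetical reverse dictionaries: surface form -> entity label.
base = [{"Praha": "LOC"}, {"Karel Čapek": "PER"}]

# Simulate an uploaded gazetteer file.
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump({"loc": ["Brno"]}, f)
    path = f.name

merged = add_additional_gazetteers(base, [path])
print(merged[0])  # {'Praha': 'LOC', 'Brno': 'LOC'} -- "loc" was uppercased and matched
print(base[0])    # {'Praha': 'LOC'} -- base is untouched thanks to copy.deepcopy

Because the merge deep-copies the base dictionaries on every call, an upload only affects that request; the old add_gazetteers path mutated the shared global gazetteers_for_matching instead.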