AlzbetaStrompova committed on
Commit
92a6f43
1 Parent(s): 1709ba8

add additional gazetteers

Browse files
Files changed (2) hide show
  1. app.py +20 -38
  2. website_script.py +29 -6
app.py CHANGED
@@ -1,42 +1,20 @@
1
  import json
2
  import gradio as gr
3
- from website_script import load, run, gaz
4
 
5
- tokenizer, model = load()
6
- gazetteers_for_matching = gaz()
7
 
8
  examples = [
9
- "Masarykova univerzita se nachází v Brně .",
10
- "Barack Obama navštívil Prahu minulý týden .",
11
- "Angela Merkelová se setkala s francouzským prezidentem v Paříži .",
12
- "Nobelova cena za fyziku byla udělena týmu vědců z MIT ."
13
  ]
14
 
15
- def add_gazetteers(new_gazetteers):
16
- global gazetteers_for_matching
17
- for key, value_lst in new_gazetteers.items():
18
- key = key.upper()
19
- for dictionary in gazetteers_for_matching:
20
- if key in dictionary.values():
21
- for value in value_lst:
22
- dictionary[value] = key
23
-
24
- def ner(text):
25
- for d in gazetteers_for_matching:
26
- print(len(d))
27
- result = run(tokenizer, model, gazetteers_for_matching, text)
28
- return {"text": text, "entities": result}
29
-
30
- def load_gazetters(file_names):
31
- print(file_names)
32
- # Assuming you have a JSON file named 'data.json'
33
- for file_name in file_names:
34
- with open(file_name, 'r') as file:
35
- data = json.load(file)
36
- gazetteers_for_matching = add_gazetteers(data)
37
-
38
-
39
 
 
 
 
40
 
41
  with gr.Blocks(css="footer{display:none !important}", theme=gr.themes.Default(primary_hue="blue", secondary_hue="sky")) as demo:
42
  # with gr.Blocks(theme=gr.themes.Soft()) as demo:
@@ -47,14 +25,18 @@ with gr.Blocks(css="footer{display:none !important}", theme=gr.themes.Default(pr
47
  examples=examples,
48
  title="NerROB-czech",
49
  description="This is an implementation of a Named Entity Recognition model for the Czech language using gazetteers.",
50
- allow_flagging="never")
51
-
52
- gr.Interface(load_gazetters,
53
- gr.File(label="Upload a JSON file", file_count="multiple", file_types=[".json"]),
54
- None,
55
  allow_flagging="never",
56
- description="Here you can upload your own gazetteers.",
57
- )
 
 
 
 
 
 
 
 
 
58
 
59
  if __name__ == "__main__":
60
  demo.launch()
 
1
  import json
2
  import gradio as gr
3
+ from website_script import load, run
4
 
5
+ tokenizer, model, gazetteers_for_matching = load()
 
6
 
7
  examples = [
8
+ ["Masarykova univerzita se nachází v Brně .", None],
9
+ ["Barack Obama navštívil Prahu minulý týden .", None],
10
+ ["Angela Merkelová se setkala s francouzským prezidentem v Paříži .", None],
11
+ ["Nobelova cena za fyziku byla udělena týmu vědců z MIT .", None]
12
  ]
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
def ner(text, file_names):
    """Run the NER pipeline on *text* and format the result for gr.HighlightedText.

    Parameters:
        text: input sentence to tag.
        file_names: optional list of uploaded JSON gazetteer file paths,
            forwarded to ``run`` so per-request gazetteers can be merged in.

    Returns:
        A dict with the original ``text`` and the detected ``entities``,
        the structure Gradio's highlighted-text output expects.
    """
    return {
        "text": text,
        "entities": run(tokenizer, model, gazetteers_for_matching, text, file_names),
    }
18
 
19
  with gr.Blocks(css="footer{display:none !important}", theme=gr.themes.Default(primary_hue="blue", secondary_hue="sky")) as demo:
20
  # with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
25
  examples=examples,
26
  title="NerROB-czech",
27
  description="This is an implementation of a Named Entity Recognition model for the Czech language using gazetteers.",
 
 
 
 
 
28
  allow_flagging="never",
29
+ additional_inputs=gr.File(label="Upload a JSON file", file_count="multiple", file_types=[".json"]),
30
+ additional_inputs_accordion=gr.Accordion(label="Additional Gazetters", open=False)
31
+ )
32
+
33
+ # gr.Interface(load_gazetters,
34
+ # gr.File(label="Upload a JSON file", file_count="multiple", file_types=[".json"]),
35
+ # None,
36
+ # allow_flagging="never",
37
+ # description="Here you can upload your own gazetteers.", )
38
+ # btn = gr.Button(value="Change Language")
39
+ # btn.click(reload)
40
 
41
  if __name__ == "__main__":
42
  demo.launch()
website_script.py CHANGED
@@ -1,3 +1,6 @@
 
 
 
1
  import torch
2
  from transformers import AutoTokenizer
3
 
@@ -9,27 +12,47 @@ from data_manipulation.preprocess_gazetteers import build_reverse_dictionary
9
  def load():
10
  model_name = "ufal/robeczech-base"
11
  model_path = "bettystr/NerRoB-czech"
 
12
 
13
  model = ExtendedEmbeddigsRobertaForTokenClassification.from_pretrained(model_path).to("cpu")
14
  tokenizer = AutoTokenizer.from_pretrained(model_name)
15
  model.eval()
16
- return tokenizer, model
17
-
18
- def gaz():
19
- gazetteers_path = "gazz2.json"
20
 
21
  gazetteers_for_matching = load_gazetteers(gazetteers_path)
22
  temp = []
23
  for i in gazetteers_for_matching.keys():
24
  temp.append(build_reverse_dictionary({i: gazetteers_for_matching[i]}))
25
  gazetteers_for_matching = temp
26
- return gazetteers_for_matching
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
- def run(tokenizer, model, gazetteers_for_matching, text):
 
29
 
30
  tokenized_inputs = tokenizer(
31
  text, truncation=True, is_split_into_words=False, return_offsets_mapping=True
32
  )
 
 
33
  matches = gazetteer_matching(text, gazetteers_for_matching)
34
  new_g = []
35
  word_ids = tokenized_inputs.word_ids()
 
1
+ import json
2
+ import copy
3
+
4
  import torch
5
  from transformers import AutoTokenizer
6
 
 
12
def load():
    """Load the fine-tuned NER model, its tokenizer, and the base gazetteers.

    Returns:
        (tokenizer, model, gazetteers_for_matching) — the model is placed on
        CPU and switched to eval mode; the gazetteers are a list of reverse
        dictionaries, one per entity label (surface form -> label).
    """
    model_name = "ufal/robeczech-base"
    model_path = "bettystr/NerRoB-czech"
    gazetteers_path = "gazz2.json"

    model = ExtendedEmbeddigsRobertaForTokenClassification.from_pretrained(model_path).to("cpu")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model.eval()

    # Build one reverse dictionary per label so matching maps value -> LABEL.
    raw_gazetteers = load_gazetteers(gazetteers_path)
    gazetteers_for_matching = [
        build_reverse_dictionary({label: raw_gazetteers[label]})
        for label in raw_gazetteers
    ]
    return tokenizer, model, gazetteers_for_matching
27
+
28
+
29
def add_additional_gazetteers(gazetteers_for_matching, file_names):
    """Merge user-uploaded JSON gazetteer files into a copy of the base gazetteers.

    Each uploaded file maps an entity label (e.g. ``"loc"``) to a list of
    surface forms.  Every surface form is inserted into the reverse dictionary
    whose values already contain the upper-cased label, keeping the
    value -> LABEL lookup structure intact.

    Parameters:
        gazetteers_for_matching: list of reverse dictionaries
            (surface form -> entity label).
        file_names: list of JSON file paths, or None/empty when the user
            uploaded nothing.

    Returns:
        The original list object when there is nothing to merge; otherwise a
        deep copy with the new entries added, so the shared base gazetteers
        are never mutated by a single request.
    """
    if not file_names:
        # Nothing uploaded: hand back the shared base gazetteers untouched.
        return gazetteers_for_matching
    # Deep-copy once so per-request additions never leak into global state.
    merged = copy.deepcopy(gazetteers_for_matching)
    for file_name in file_names:
        # Explicit UTF-8: gazetteer entries are Czech text and must not
        # depend on the platform's default encoding.
        with open(file_name, "r", encoding="utf-8") as file:
            data = json.load(file)
        for key, value_lst in data.items():
            key = key.upper()
            for dictionary in merged:
                # Only extend the dictionary that already holds this label.
                if key in dictionary.values():
                    for value in value_lst:
                        dictionary[value] = key
    return merged
46
+
47
 
48
+ def run(tokenizer, model, gazetteers, text, file_names=None):
49
+ gazetteers_for_matching = add_additional_gazetteers(gazetteers, file_names)
50
 
51
  tokenized_inputs = tokenizer(
52
  text, truncation=True, is_split_into_words=False, return_offsets_mapping=True
53
  )
54
+ for i in gazetteers_for_matching:
55
+ print(len(i))
56
  matches = gazetteer_matching(text, gazetteers_for_matching)
57
  new_g = []
58
  word_ids = tokenized_inputs.word_ids()