lterriel committed
Commit af23812 Β· 1 Parent(s): e15cd3b
Files changed (4)
  1. .gitattributes +3 -0
  2. app.py +63 -30
  3. requirements.txt +1 -0
  4. samples/FRAN_IR_050370.xml +0 -0
.gitattributes CHANGED
@@ -29,3 +29,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+assets/*.png filter=lfs diff=lfs merge=lfs -text
+assets filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -8,27 +8,50 @@ from spacy.tokens import Doc
 
 streamlit.set_page_config(layout="wide")
 
-default_text = "c'est un texte de test"
+samples_test = {"FRAN_IR_050370.xml": "./samples/FRAN_IR_050370.xml"}
 
 # TITLE APP
 streamlit.title("NER4Archives visualizer")
+streamlit.sidebar.title("NER4Archives visualizer")
+streamlit.sidebar.write("## Motivation")
+streamlit.sidebar.markdown("""<div style="text-align: justify;">
+<p>This application is a proof-of-concept for applying a text classification task (also called named-entity recognition, NER) to
+XML <a href="https://www.loc.gov/ead/" target="_blank">EAD</a> <a href="https://fr.wikipedia.org/wiki/Instrument_de_recherche" target="_blank">finding aids</a> and evaluating the NER predictions.</p>
 
-# DESCRIPTION APP
-streamlit.write("This is a NER model test application for NER4Archives project. "
-                "Pass through the analyzer only small samples.")
+<p>In the context of the <a href="https://github.com/NER4Archives-project" target="_blank">NER4Archives project</a> (INRIA-ALMAnaCH/Archives nationales), the goal is to train NER models on an annotated dataset
+extracted from XML EAD finding aids and test them on new data.</p>
 
+<p>Most of the models available here are trained with the NLP framework <a href="https://spacy.io/" target="_blank">spaCy</a>
+and are available on the <a href="https://huggingface.co/ner4archives" target="_blank">HF organisation hub</a>.
+Other models may be added in the future.</p>
+
+<p>The project also includes a downstream entity linking task. The <a href="https://github.com/Lucaterre/spacyfishing" target="_blank">spaCy fishing</a> extension (based on <a href="https://github.com/kermitt2/entity-fishing" target="_blank">entity-fishing</a>) is used here to support this purpose.</p>
+
+NER4Archives - 2022</div>
+""", unsafe_allow_html=True)
+
+scol1, scol2 = streamlit.sidebar.columns(2)
+scol1.image("./assets/an.png", width=170)
+scol2.image("./assets/almanach_rouge-inria.png", width=100)
+
+flag_file = False
 
 # 1. User provides a XML EAD
-streamlit.write("## 1. πŸ“„ Input XML EAD:")
+streamlit.write("## πŸ“„ Input XML EAD:")
 filename = streamlit.file_uploader("Load an XML EAD", type="xml")
-#with open(filename, mode='r', encoding='utf-8') as f:
-#    file = f.read()
-flag_file = False
+streamlit.markdown("or use an XML EAD provided in the [`samples/`](./samples) directory")
+data = ""
+
+
+
+
 flag_model = False
 if filename is not None:
-    if len(filename.getvalue()) > 0:
+    data = filename.getvalue().decode("utf-8").encode("utf-8")
+    if len(data) > 0:
         flag_file = True
-    #print(file)
+
+
 
 
 import re
@@ -61,12 +84,10 @@ linking = True
 flag_view = False
 if flag_file:
     col1, col2 = streamlit.columns(2)
-    col1.write("## XML tree view:")
-    col2.write("## Plain text view:")
+    col1.write("## πŸ‘οΈ XML tree view:")
+    col2.write("## πŸ‘οΈ Plain text view:")
     parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
-    tree = etree.fromstring(filename.getvalue().decode("utf-8").encode('utf-8'), parser=parser)
-    # representation cool:
-    #input_xml = col1.markdown(f'<pre>{etree.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")}</pre>', unsafe_allow_html=False)
+    tree = etree.fromstring(data, parser=parser)
     xml = etree.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")
     col1.text_area("", value=xml, height=500, disabled=True)
     dids, sentences = ead_strategy(tree)
@@ -75,23 +96,22 @@ if flag_file:
     flag_view = True
 
 if flag_view:
-    streamlit.write("## 2. βš™οΈ Choose NER model and configure:")
+    streamlit.write("## βš™οΈ Configure NER model and options:")
     models = []
    for pipe in spacy.info()["pipelines"]:
        models.append(pipe)
    option = streamlit.selectbox(
-        'Choose the NER model you want to apply: ',
+        'Choose a NER model to apply from the list: ',
        models)
    model = option
    if model != "":
        flag_model = True
-        linking = streamlit.checkbox('Apply named entity linking (entity-fishing component)', value=True)
+        linking = streamlit.checkbox('Check to apply named entity linking (entity-fishing component)', value=True)
        linkingicon = "βœ…οΈ"
        if linking is False:
            linkingicon = "❌"
        streamlit.write("#### Actual Parameters:")
        streamlit.write(f'- NER model selected: {option}\n - linking: {linkingicon}')
-
 @Language.factory("custom_ner", default_config={
     "model_name": "",
     "sentences_to_process": []
@@ -104,6 +124,13 @@ class CustomNer:
                  sentences_to_process: list):
         self.nlp = nlp
         self.pipeline_ner = spacy.load(model_name)
+        f_score = self.pipeline_ner.meta['performance']['ents_f']
+        recall = self.pipeline_ner.meta['performance']['ents_r']
+        precision = self.pipeline_ner.meta['performance']['ents_p']
+        mcol1, mcol2, mcol3 = streamlit.columns(3)
+        mcol1.metric("F-Score", f'{f_score:.2f}')
+        mcol2.metric("Precision", f'{precision:.2f}')
+        mcol3.metric("Recall", f'{recall:.2f}')
         self.sentences = sentences_to_process
 
     def __call__(self, doc: Doc):
@@ -132,14 +159,14 @@ entities = []
 flag_vizualize = False
 
 if flag_model:
-    if streamlit.button('Launch model'):
+    if streamlit.button('Launch'):
         with streamlit.spinner('Initialize NER...'):
             huge_pipeline_linking = spacy.blank("fr")
             huge_pipeline_linking.max_length = 5000000
             huge_pipeline_linking.add_pipe('custom_ner', config={"model_name": model, "sentences_to_process": sentences})
             if linking:
                 huge_pipeline_linking.add_pipe('entityfishing', config={"language": "fr"})
-        with streamlit.spinner('NER processing...'):
+        with streamlit.spinner('NER processing... (please wait, this may take a while depending on data size)'):
             doc = huge_pipeline_linking(plain)
 
         entities = [
@@ -147,23 +174,25 @@ if flag_model:
             ent.end_char,
             ent.text,
             ent.label_,
-            ent._.url_wikidata
+            ent._.url_wikidata,
+            ent._.nerd_score
             ) for ent in doc.ents
        ]
-        streamlit.success('πŸ™‚ NER applied with success!')
+        streamlit.success('πŸ˜ƒ NER applied successfully!')
 
 
         df = pd.DataFrame(entities, columns=['START',
                                              'END',
                                              'MENTION',
                                              'NER LABEL',
-                                             'WIKIDATA RESSOURCE (wikidata disambiguation)'
+                                             'WIKIDATA RESOURCE (wikidata disambiguation)',
+                                             'LINKING SCORE'
                                              ])
 
-        streamlit.write("## 3. πŸ”Ž Explore named entities in table: ")
+        streamlit.write("## πŸ”Ž Explore named entities in table: ")
         streamlit.write(df)
 
-        streamlit.write("## 3. πŸ”Ž Explore named entities in text: ")
+        streamlit.write("## πŸ”Ž Explore named entities in text: ")
         spacy_streamlit.visualize_ner(
             [{"text": doc.text,
               "ents": [
@@ -174,19 +203,23 @@ if flag_model:
                  "kb_url": ent._.url_wikidata
                  } for ent in doc.ents
              ]}],
-            labels=["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE"],
+            labels=["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE", 'LOC', 'MISC', 'ORG', 'PER'],
             show_table=False,
             manual=True,
             title="",
             displacy_options={
-                "colors":{
+                "colors": {
                     "EVENT": "#ec7063",
                     "LOCATION": "#45b39d",
                     "ORGANISATION": "#f39c12",
                     "PERSON": "#3498db",
-                    "TITLE": "#a569bd "
+                    "TITLE": "#a569bd ",
+                    "LOC": "#45b39d",
+                    "MISC": "#ec7063",
+                    "ORG": "#f39c12",
+                    "PER": "#3498db"
+
                 }
             })
-        # streamlit.markdown(f"<h4>Explore entities:<h4><br><div style='overflow:scroll; overflow-y: auto; max-height: 1050px;'>{df.to_html(justify='center')}</div>", unsafe_allow_html=True)
 
 
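Editor's note: for readers who want to reproduce the pipeline assembly shown in the app.py diff above outside Streamlit, here is a minimal sketch. It assumes the spacyfishing extension and the fr_ner4archives_default_test model wheel from requirements.txt are installed; the simplified component below re-annotates the whole text rather than iterating the app's sentences_to_process, the example sentence is illustrative only, and entityfishing requires network access to an entity-fishing service.

# Minimal sketch (not the full app): same wrapper-pipeline pattern as app.py,
# assuming `spacyfishing` and the `fr_ner4archives_default_test` wheel are installed.
import spacy
from spacy.language import Language
from spacy.tokens import Doc
from spacy.util import filter_spans


@Language.factory("custom_ner", default_config={"model_name": "", "sentences_to_process": []})
def create_custom_ner(nlp: Language, name: str, model_name: str, sentences_to_process: list):
    ner = spacy.load(model_name)  # the dedicated NER pipeline chosen in the UI

    def component(doc: Doc) -> Doc:
        # Simplified: annotate the whole text instead of the app's sentence batches,
        # then copy the predicted entities onto the wrapper doc.
        predicted = ner(doc.text)
        spans = [doc.char_span(e.start_char, e.end_char, label=e.label_,
                               alignment_mode="expand") for e in predicted.ents]
        doc.ents = filter_spans([s for s in spans if s is not None])
        return doc

    return component


nlp = spacy.blank("fr")
nlp.max_length = 5000000
nlp.add_pipe("custom_ner", config={"model_name": "fr_ner4archives_default_test",
                                   "sentences_to_process": []})
nlp.add_pipe("entityfishing", config={"language": "fr"})  # Wikidata linking via the entity-fishing service

doc = nlp("Pierre Dupont, archiviste aux Archives nationales, Paris, 1922.")
for ent in doc.ents:
    print(ent.start_char, ent.end_char, ent.text, ent.label_,
          ent._.url_wikidata, ent._.nerd_score)

This mirrors the order used in the commit: the custom NER component first populates doc.ents, then the entityfishing component adds the Wikidata URL and linking score extensions that the table and visualizer read.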
requirements.txt CHANGED
@@ -23,6 +23,7 @@ defusedxml==0.7.1
 entrypoints==0.4
 executing==0.9.1
 fastjsonschema==2.16.1
+fr-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.3.0/fr_core_news_sm-3.3.0-py3-none-any.whl
 fr-ner4archives-default-test @ https://huggingface.co/ner4archives/fr_ner4archives_default_test/resolve/main/fr_ner4archives_default_test-any-py3-none-any.whl
 gitdb==4.0.9
 GitPython==3.1.27
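Editor's note: the model wheels pinned above install as regular Python packages, which is why app.py can enumerate them with spacy.info()["pipelines"]. A quick sanity check, assuming the requirements have been installed:

# Verify the pinned pipelines are visible to spaCy the same way app.py discovers them.
import spacy

pipelines = spacy.info()["pipelines"]   # mapping: installed pipeline package -> version
print(pipelines)                        # expect fr_core_news_sm and fr_ner4archives_default_test

nlp = spacy.load("fr_core_news_sm")     # wheel-installed pipelines are loadable by package name
print(nlp.pipe_names)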
samples/FRAN_IR_050370.xml ADDED
The diff for this file is too large to render. See raw diff
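Editor's note: the added sample (referenced by samples_test in app.py) can be inspected outside Streamlit with the same recovering lxml parser the app uses. The sketch below stops at parsing; ead_strategy, the project's own helper that derives the dids and plain-text sentences, is not reproduced here.

# Parse the sample finding aid with the same parser configuration as app.py.
from lxml import etree

parser = etree.XMLParser(ns_clean=True, recover=True, encoding="utf-8")
with open("samples/FRAN_IR_050370.xml", "rb") as f:
    tree = etree.fromstring(f.read(), parser=parser)

xml = etree.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")
print(xml[:500])  # preview the first lines of the EAD document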