lterriel committed
Commit af23812 Β· 1 Parent(s): e15cd3b
Files changed (4)
  1. .gitattributes +3 -0
  2. app.py +63 -30
  3. requirements.txt +1 -0
  4. samples/FRAN_IR_050370.xml +0 -0
.gitattributes CHANGED
@@ -29,3 +29,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+assets/*.png filter=lfs diff=lfs merge=lfs -text
+assets filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -8,27 +8,50 @@ from spacy.tokens import Doc
 
 streamlit.set_page_config(layout="wide")
 
-default_text = "c'est un texte de test"
+samples_test = {"FRAN_IR_050370.xml": "./samples/FRAN_IR_050370.xml"}
 
 # TITLE APP
 streamlit.title("NER4Archives visualizer")
+streamlit.sidebar.title("NER4Archives visualizer")
+streamlit.sidebar.write("## Motivation")
+streamlit.sidebar.markdown("""<div style="text-align: justify;">
+<p>This application is a proof-of-concept for applying a text classification task (also called named-entity recognition, NER) to
+XML <a href="https://www.loc.gov/ead/" target="_blank">EAD</a> <a href="https://fr.wikipedia.org/wiki/Instrument_de_recherche" target="_blank">finding aids</a> and evaluating the NER predictions.</p>
 
-# DESCRIPTION APP
-streamlit.write("This is a NER model test application for NER4Archives project. "
-                "Pass through the analyzer only small samples.")
+<p>In the context of the <a href="https://github.com/NER4Archives-project" target="_blank">NER4Archives project</a> (INRIA-ALMAnaCH/Archives nationales), the goal is to train NER models on an annotated dataset
+extracted from XML EAD finding aids and test them on new data.</p>
 
+<p>Most of the models available here are trained with the NLP framework <a href="https://spacy.io/" target="_blank">spaCy</a>
+and are available on the <a href="https://huggingface.co/ner4archives" target="_blank">HF organisation hub</a>.
+Other models may be added in the future.</p>
+
+<p>The project also includes a downstream entity linking task. The <a href="https://github.com/Lucaterre/spacyfishing" target="_blank">spaCy fishing</a> extension (based on <a href="https://github.com/kermitt2/entity-fishing" target="_blank">entity-fishing</a>) is used here to support this purpose.</p>
+
+NER4Archives - 2022</div>
+""", unsafe_allow_html=True)
+
+scol1, scol2 = streamlit.sidebar.columns(2)
+scol1.image("./assets/an.png", width=170)
+scol2.image("./assets/almanach_rouge-inria.png", width=100)
+
+flag_file = False
 
 # 1. User provides a XML EAD
-streamlit.write("## 1. πŸ“„ Input XML EAD:")
+streamlit.write("## πŸ“„ Input XML EAD:")
 filename = streamlit.file_uploader("Load an XML EAD", type="xml")
-#with open(filename, mode='r', encoding='utf-8') as f:
-#    file = f.read()
-flag_file = False
+streamlit.markdown("or use an XML EAD provided in the [`samples/`](./samples) directory")
+data = ""
+
+
+
+
 flag_model = False
 if filename is not None:
-    if len(filename.getvalue()) > 0:
+    data = filename.getvalue().decode("utf-8").encode("utf-8")
+    if len(data) > 0:
         flag_file = True
-    #print(file)
+
+
 
 
 import re
@@ -61,12 +84,10 @@ linking = True
 flag_view = False
 if flag_file:
     col1, col2 = streamlit.columns(2)
-    col1.write("## XML tree view:")
-    col2.write("## Plain text view:")
+    col1.write("## πŸ‘οΈ XML tree view:")
+    col2.write("## πŸ‘οΈ Plain text view:")
     parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
-    tree = etree.fromstring(filename.getvalue().decode("utf-8").encode('utf-8'), parser=parser)
-    # representation cool:
-    #input_xml = col1.markdown(f'<pre>{etree.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")}</pre>', unsafe_allow_html=False)
+    tree = etree.fromstring(data, parser=parser)
     xml = etree.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")
     col1.text_area("", value=xml, height=500, disabled=True)
     dids, sentences = ead_strategy(tree)
@@ -75,23 +96,22 @@ if flag_file:
     flag_view = True
 
 if flag_view:
-    streamlit.write("## 2. βš™οΈ Choose NER model and configure:")
+    streamlit.write("## βš™οΈ Configure NER model and options:")
     models = []
    for pipe in spacy.info()["pipelines"]:
        models.append(pipe)
    option = streamlit.selectbox(
-        'Choose the NER model you want to apply: ',
+        'Choose a NER model to apply from the list: ',
        models)
    model = option
    if model != "":
        flag_model = True
-        linking = streamlit.checkbox('Apply named entity linking (entity-fishing component)', value=True)
+        linking = streamlit.checkbox('Check to apply named entity linking (entity-fishing component)', value=True)
        linkingicon = "βœ…οΈ"
        if linking is False:
            linkingicon = "❌"
        streamlit.write("#### Actual Parameters:")
        streamlit.write(f'- NER model selected: {option}\n - linking: {linkingicon}')
-
 @Language.factory("custom_ner", default_config={
     "model_name": "",
     "sentences_to_process": []
@@ -104,6 +124,13 @@ class CustomNer:
                  sentences_to_process: list):
         self.nlp = nlp
         self.pipeline_ner = spacy.load(model_name)
+        f_score = self.pipeline_ner.meta['performance']['ents_f']
+        recall = self.pipeline_ner.meta['performance']['ents_r']
+        precision = self.pipeline_ner.meta['performance']['ents_p']
+        mcol1, mcol2, mcol3 = streamlit.columns(3)
+        mcol1.metric("F-Score", f'{f_score:.2f}')
+        mcol2.metric("Precision", f'{precision:.2f}')
+        mcol3.metric("Recall", f'{recall:.2f}')
         self.sentences = sentences_to_process
 
     def __call__(self, doc: Doc):
@@ -132,14 +159,14 @@ entities = []
 flag_vizualize = False
 
 if flag_model:
-    if streamlit.button('Launch model'):
+    if streamlit.button('Launch'):
         with streamlit.spinner('Initialize NER...'):
             huge_pipeline_linking = spacy.blank("fr")
             huge_pipeline_linking.max_length = 5000000
             huge_pipeline_linking.add_pipe('custom_ner', config={"model_name": model, "sentences_to_process": sentences})
             if linking:
                 huge_pipeline_linking.add_pipe('entityfishing', config={"language": "fr"})
-        with streamlit.spinner('NER processing...'):
+        with streamlit.spinner('NER processing... (please wait, this may take a while depending on data size)'):
             doc = huge_pipeline_linking(plain)
 
         entities = [
@@ -147,23 +174,25 @@ if flag_model:
             ent.end_char,
             ent.text,
             ent.label_,
-            ent._.url_wikidata
+            ent._.url_wikidata,
+            ent._.nerd_score
             ) for ent in doc.ents
        ]
-        streamlit.success('πŸ™‚ NER applied with success!')
+        streamlit.success('πŸ˜ƒ NER applied successfully!')
 
 
         df = pd.DataFrame(entities, columns=['START',
                                              'END',
                                              'MENTION',
                                              'NER LABEL',
-                                             'WIKIDATA RESSOURCE (wikidata disambiguation)'
+                                             'WIKIDATA RESOURCE (wikidata disambiguation)',
+                                             'LINKING SCORE'
                                              ])
 
-        streamlit.write("## 3. πŸ”Ž Explore named entities in table: ")
+        streamlit.write("## πŸ”Ž Explore named entities in table: ")
         streamlit.write(df)
 
-        streamlit.write("## 3. πŸ”Ž Explore named entities in text: ")
+        streamlit.write("## πŸ”Ž Explore named entities in text: ")
         spacy_streamlit.visualize_ner(
             [{"text": doc.text,
               "ents": [
@@ -174,19 +203,23 @@ if flag_model:
                  "kb_url": ent._.url_wikidata
                  } for ent in doc.ents
              ]}],
-            labels=["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE"],
+            labels=["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE", 'LOC', 'MISC', 'ORG', 'PER'],
             show_table=False,
             manual=True,
             title="",
             displacy_options={
-                "colors":{
+                "colors": {
                     "EVENT": "#ec7063",
                     "LOCATION": "#45b39d",
                     "ORGANISATION": "#f39c12",
                     "PERSON": "#3498db",
-                    "TITLE": "#a569bd "
+                    "TITLE": "#a569bd ",
+                    "LOC": "#45b39d",
+                    "MISC": "#ec7063",
+                    "ORG": "#f39c12",
+                    "PER": "#3498db"
+
                 }
             })
-        # streamlit.markdown(f"<h4>Explore entities:<h4><br><div style='overflow:scroll; overflow-y: auto; max-height: 1050px;'>{df.to_html(justify='center')}</div>", unsafe_allow_html=True)
 
 
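Editor's note: for readers who want to reproduce the pipeline assembly shown in the app.py diff above outside Streamlit, here is a minimal sketch. It assumes the spacyfishing extension and the fr_ner4archives_default_test model wheel from requirements.txt are installed; the simplified component below re-annotates the whole text rather than iterating the app's sentences_to_process, the example sentence is illustrative only, and entityfishing requires network access to an entity-fishing service.

# Minimal sketch (not the full app): same wrapper-pipeline pattern as app.py,
# assuming `spacyfishing` and the `fr_ner4archives_default_test` wheel are installed.
import spacy
from spacy.language import Language
from spacy.tokens import Doc
from spacy.util import filter_spans


@Language.factory("custom_ner", default_config={"model_name": "", "sentences_to_process": []})
def create_custom_ner(nlp: Language, name: str, model_name: str, sentences_to_process: list):
    ner = spacy.load(model_name)  # the dedicated NER pipeline chosen in the UI

    def component(doc: Doc) -> Doc:
        # Simplified: annotate the whole text instead of the app's sentence batches,
        # then copy the predicted entities onto the wrapper doc.
        predicted = ner(doc.text)
        spans = [doc.char_span(e.start_char, e.end_char, label=e.label_,
                               alignment_mode="expand") for e in predicted.ents]
        doc.ents = filter_spans([s for s in spans if s is not None])
        return doc

    return component


nlp = spacy.blank("fr")
nlp.max_length = 5000000
nlp.add_pipe("custom_ner", config={"model_name": "fr_ner4archives_default_test",
                                   "sentences_to_process": []})
nlp.add_pipe("entityfishing", config={"language": "fr"})  # Wikidata linking via the entity-fishing service

doc = nlp("Pierre Dupont, archiviste aux Archives nationales, Paris, 1922.")
for ent in doc.ents:
    print(ent.start_char, ent.end_char, ent.text, ent.label_,
          ent._.url_wikidata, ent._.nerd_score)

This mirrors the order used in the commit: the custom NER component first populates doc.ents, then the entityfishing component adds the Wikidata URL and linking score extensions that the table and visualizer read.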
requirements.txt CHANGED
@@ -23,6 +23,7 @@ defusedxml==0.7.1
 entrypoints==0.4
 executing==0.9.1
 fastjsonschema==2.16.1
+fr-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.3.0/fr_core_news_sm-3.3.0-py3-none-any.whl
 fr-ner4archives-default-test @ https://huggingface.co/ner4archives/fr_ner4archives_default_test/resolve/main/fr_ner4archives_default_test-any-py3-none-any.whl
 gitdb==4.0.9
 GitPython==3.1.27
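Editor's note: the model wheels pinned above install as regular Python packages, which is why app.py can enumerate them with spacy.info()["pipelines"]. A quick sanity check, assuming the requirements have been installed:

# Verify the pinned pipelines are visible to spaCy the same way app.py discovers them.
import spacy

pipelines = spacy.info()["pipelines"]   # mapping: installed pipeline package -> version
print(pipelines)                        # expect fr_core_news_sm and fr_ner4archives_default_test

nlp = spacy.load("fr_core_news_sm")     # wheel-installed pipelines are loadable by package name
print(nlp.pipe_names)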
samples/FRAN_IR_050370.xml ADDED
The diff for this file is too large to render. See raw diff
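Editor's note: the added sample (referenced by samples_test in app.py) can be inspected outside Streamlit with the same recovering lxml parser the app uses. The sketch below stops at parsing; ead_strategy, the project's own helper that derives the dids and plain-text sentences, is not reproduced here.

# Parse the sample finding aid with the same parser configuration as app.py.
from lxml import etree

parser = etree.XMLParser(ns_clean=True, recover=True, encoding="utf-8")
with open("samples/FRAN_IR_050370.xml", "rb") as f:
    tree = etree.fromstring(f.read(), parser=parser)

xml = etree.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")
print(xml[:500])  # preview the first lines of the EAD document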