Upload 9 files

- app.py +773 -0
- requirements.txt +6 -0
- static/Logo_cnrs.png +0 -0
- static/logo_inrap.png +0 -0
- static/logo_ir_metopes.png +0 -0
- static/logo_mesr.png +0 -0
- static/logo_mrsh.jpg +0 -0
- static/logo_ouvrir_la_science.png +0 -0
- static/logo_unicaen.png +0 -0
app.py
ADDED
@@ -0,0 +1,773 @@
import streamlit as st
import spacy
from spacy import displacy
import pandas as pd
from io import StringIO, BytesIO
from lxml import etree
from bs4 import BeautifulSoup
import html
from streamlit_extras.stylable_container import stylable_container
import subprocess


# ===== SOME SETTING UP =====

# Setting up the app's page
st.set_page_config(page_title="ARCHES | Visualisation & extraction", page_icon="🏺")

# Path to the statics directory
statics = "./static"

# Making the radio widgets' titles bigger
st.markdown(
    """<style>
    div[class*="stRadio"] > label > div[data-testid="stMarkdownContainer"] > p {
        font-size: 17px;
    }
    </style>
    """, unsafe_allow_html=True)

# Hiding the possibility to display pictures fullscreen
st.markdown(
    """<style>
    button[title="View fullscreen"]{
        visibility: hidden;
    }
    </style>
    """, unsafe_allow_html=True)

# Setting up the colors of the entity tags for displacy
ENTITIES_COLORS = {
    "CHRONOLOGIE": "#ffb627",
    "MOBILIER": "#6b7fd7",
    "MATERIAU": "#d36582",
    "STRUCTURE": "#00b2ca",
    "TECHNIQUE_STYLE": "#ED6A5A",
    "ESPECE": "#96C7FF",
    "EDIFICE": "#9F86C0",
    "ID": "#f65bff",
    "LIEUDIT_SITE": "#d8e446",
    "PERSONNE": "#D3B4B4",
    "PEUPLE_CULTURE": "#d20000",
    "LOC": "#81db72",
    "DECOR": "#fff46a",
    "ORG": "#887575",
    "GPE": "#00a878"
}
OPTIONS = {
    "ents": [
        "CHRONOLOGIE",
        "MOBILIER",
        "MATERIAU",
        "STRUCTURE",
        "TECHNIQUE_STYLE",
        "ESPECE",
        "EDIFICE",
        "ID",
        "LIEUDIT_SITE",
        "PERSONNE",
        "PEUPLE_CULTURE",
        "LOC",
        "DECOR",
        "ORG",
        "GPE"
    ],
    "colors": ENTITIES_COLORS}
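
# Note (illustrative): OPTIONS is consumed below by displacy.render(..., style="ent", manual=True),
# where each document is passed as a dict of the form:
#   {"text": "La fosse 34", "ents": [{"start": 3, "end": 8, "label": "STRUCTURE"}]}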

# ===== SIDEBAR =====

st.sidebar.title("ARCHES - Étude, composition et processus pour une édition structurée des rapports d’opérations archéologiques préventives")

st.sidebar.markdown("Avec ses 2200 collaborateurs, l’[Inrap](https://www.inrap.fr/) représente la plus importante structure publique de recherche archéologique française. De fait, chaque année, près de 2000 chantiers (diagnostics archéologiques et fouilles) sont réalisés en partenariat avec les aménageurs publics et privés, en France métropolitaine et dans les départements d’outre-mer. Les missions de l’Institut intégrant l’exploitation scientifique des résultats et la diffusion de la connaissance archéologique auprès du public, plus de 2000 rapports d’opération archéologique sont ainsi rédigés annuellement.")

st.sidebar.markdown("Financé avec le soutien du [Fonds National pour la Science Ouverte](https://www.ouvrirlascience.fr/accueil/) et réalisé en collaboration avec l’infrastructure de recherche [Métopes](http://www.metopes.fr/) ([Université de Caen Normandie](https://www.unicaen.fr/) - [CNRS](https://www.cnrs.fr/fr)), [ARCHES](https://www.inrap.fr/arches-etude-composition-et-processus-pour-une-edition-structuree-des-rapports-d-17145) vise à explorer l’amélioration de la diffusion et de l’exploitation des rapports d’opération à l’aide du format de balisage XML-TEI, permettant d’encoder tant la structuration formelle que le contenu sémantique d’un document. Dans cette optique, vingt-et-un rapports de fouilles de l’Inrap ont été annotés pour entraîner un modèle de reconnaissance des entités nommées (représentant plus de 80 000 entités annotées). Cette application vise à tester la manipulation du modèle, tant avec des fichiers XML que texte brut.")

st.sidebar.write("")

st.sidebar.header("Partenaires")

# Display logos
col1, col2, col3 = st.sidebar.columns(3)
col1.image(f"{statics}/logo_inrap.png", width=100)
col2.write("")
col2.image(f"{statics}/logo_ouvrir_la_science.png", width=100)
col3.image(f"{statics}/logo_mesr.png", width=100)
col1.image(f"{statics}/logo_ir_metopes.png", width=100)
col2.write("")
col2.write("")
col2.image(f"{statics}/logo_mrsh.jpg", width=100)
col3.image(f"{statics}/logo_unicaen.png", width=100)
col1.image(f"{statics}/logo_cnrs.png", width=80)

# ===== SOME FUNCTIONS =====


# Cached to prevent computation on every rerun
@st.cache_resource
def download_sentencizer():
    """
    Downloads the fr_arches_sentencizer model.

    :returns: None
    """

    # check=True so a failed wheel install raises instead of being silently ignored
    subprocess.run(["pip", "install", "https://huggingface.co/a-menu/fr_arches_sentencizer/resolve/main/fr_arches_sentencizer-any-py3-none-any.whl"], check=True)


# Cached to prevent computation on every rerun
@st.cache_resource
def download_ner():
    """
    Downloads the fr_arches TRF NER model.

    :returns: None
    """

    subprocess.run(["pip", "install", "https://huggingface.co/a-menu/fr_arches/resolve/main/fr_arches-any-py3-none-any.whl"], check=True)


# Cached to prevent computation on every rerun
@st.cache_resource
def load_sentencizer():
    """
    Loads our custom sentence segmentation model.

    :returns: loaded fr_arches_sentencizer model
    :rtype: spacy.lang.fr.French
    """

    senter = spacy.load("fr_arches_sentencizer")
    return senter


# Cached to prevent computation on every rerun
@st.cache_resource
def load_ner():
    """
    Loads our custom fr_arches trf ner model.

    :returns: loaded fr_arches model
    :rtype: spacy.lang.fr.French
    """

    ner = spacy.load("fr_arches")
    return ner


def apply_senter(senter, data):
    """
    Applies our custom sentence segmentation model on data.

    :param senter: sentence segmentation model
    :type senter: spacy.lang.fr.French
    :param data: text to be segmented
    :type data: str
    :returns: sentencized text, one sentence per line
    :rtype: str
    """

    mes_phrases = senter(data)
    sentencized_text = ""
    for sent in mes_phrases.sents:
        sentencized_text += str(sent) + "\n"
    return sentencized_text
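
# Example (illustrative, assuming the model splits on the full stops):
# apply_senter(senter, "Le mur 12 est arasé. Il date du XIIe siècle.") returns
# "Le mur 12 est arasé.\nIl date du XIIe siècle.\n", i.e. one sentence per line.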


def get_doc(ner, data):
    """
    Applies our custom ner model on data.

    :param ner: ner model
    :type ner: spacy.lang.fr.French
    :param data: text to be analyzed
    :type data: str
    :returns: spacy doc
    :rtype: spacy.tokens.doc.Doc
    """

    doc = ner(data)
    return doc


def get_entities(doc):
    """
    Extracts the named entities from the doc.

    :param doc: spacy doc
    :type doc: spacy.tokens.doc.Doc
    :returns: list of named entities as (text, label, start_char, end_char) tuples
    :rtype: list
    """

    entities = []
    for ent in doc.ents:
        # Join tokens with or without space based on the preceding token
        filtered_tokens = ""
        for i, token in enumerate(ent):
            # Skip newline tokens within the entity
            if token.text == "\n":
                continue
            # If the last character of the previous token is an apostrophe, join the next token without a space
            if i > 0 and ent[i - 1].text[-1] in ("’", "'"):
                filtered_tokens += token.text
            # Join them with a space otherwise
            else:
                filtered_tokens += " " + token.text if i > 0 else token.text

        # Collect the start and end character positions for the entire entity
        start_char = ent.start_char
        # Correct the end position (useful when a token == \n for instance)
        difference = len(ent.text) - len(filtered_tokens)
        end_char = ent.end_char - difference

        entities.append((filtered_tokens.strip(), ent.label_, start_char, end_char))

    return entities
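
# Example (illustrative): get_entities returns (text, label, start_char, end_char) tuples,
# e.g. ("fosse", "STRUCTURE", 3, 8) for an entity spanning characters 3 to 8.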


def create_displacy(text, entities):
    """
    Render named entities using displacy.

    :param text: input text
    :type text: str
    :param entities: list of named entities with start and end character positions
    :type entities: list
    :returns: showcase of entities with displacy
    :rtype: str
    """

    # Prepare data for displacy
    entity_data = [{"start": ent[2], "end": ent[3], "label": ent[1]} for ent in entities]
    # Render using displacy
    my_displacy = displacy.render([{"text": text, "ents": entity_data}], style="ent", options=OPTIONS, manual=True)
    return my_displacy


def create_df(entities):
    """
    Creates a dataframe to display the named entities found in text.

    :param entities: named entities
    :type entities: list
    :returns: dataframe
    :rtype: pd.DataFrame
    """

    df = pd.DataFrame(entities, columns=["ENTITE",
                                         "LABEL",
                                         "DEBUT",
                                         "FIN"
                                         ])
    return df


def df_to_csv(df_to_convert):
    """
    Converts df to csv.

    :param df_to_convert: dataframe to be converted to csv
    :type df_to_convert: pd.DataFrame
    :returns: csv-formatted string
    :rtype: str
    """

    return df_to_convert.to_csv(encoding="utf-8")


def doc_to_conll(doc, updated_name=False):
    """
    Converts a doc and its entities to a conll2002 file.

    :param doc: spacy doc
    :type doc: spacy.tokens.doc.Doc
    :param updated_name: name (without extension) for the downloaded file; False to use the default name
    :type updated_name: str or bool
    :returns: button to download the conll2002 file
    :rtype: bool
    """

    # Writing to a BytesIO object to get the byte content
    with BytesIO() as sortie_buffer:
        for tok in doc:
            # Convert a named entity to conll2002
            if tok.ent_type and tok.text != "\n":
                sortie_buffer.write(f"{tok.text} {tok.ent_iob_}-{tok.ent_type_}\n".encode("utf-8"))
            # Convert a token without a named entity to conll2002
            else:
                if tok.text != "\n" and tok.ent_iob_:
                    sortie_buffer.write(f"{tok.text} {tok.ent_iob_}\n".encode("utf-8"))
                # Write a single empty line for each new line in the original text
                else:
                    sortie_buffer.write(b"\n")

        # Move the buffer position to the beginning for reading
        sortie_buffer.seek(0)

        # Check if the buffer has a line only consisting of "O\n" and delete it
        buffer_content = sortie_buffer.getvalue().decode("utf-8")
        lines = buffer_content.split("\n")
        modified_lines = [line for line in lines if line.strip() != "O"]
        modified_buffer_content = "\n".join(modified_lines)

        # Write the modified content back to the buffer
        sortie_buffer.seek(0)
        sortie_buffer.write(modified_buffer_content.encode("utf-8"))
        # Drop any leftover bytes in case the modified content is shorter than the original
        sortie_buffer.truncate()

        # Move the buffer position to the beginning for reading
        sortie_buffer.seek(0)

        # If we have an uploaded file: update the name of the exported file.
        if updated_name:
            my_button = st.download_button(
                label="Télécharger le fichier CoNLL2002",
                data=sortie_buffer,
                file_name=updated_name + ".conll"
            )
        # If we have no uploaded file ('example on the go' mode): use a default name for the exported file.
        else:
            my_button = st.download_button(
                label="Télécharger le fichier CoNLL2002",
                data=sortie_buffer,
                file_name="prediction_arches.conll"
            )

    return my_button
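
# Illustrative CoNLL2002 output (one "token tag" pair per line; actual tags depend on the model):
#   La O
#   fosse B-STRUCTURE
#   34 B-ID
#   a O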

def get_body_text(xml_input):
    """
    Parses an xml file and returns the text of its <body>.

    :param xml_input: xml content to be parsed
    :type xml_input: bytes or str
    :returns: the text of the <body> if successful, None otherwise
    :rtype: str or None
    """

    try:
        # Parse XML content
        parser = etree.XMLParser(recover=True)
        root = etree.fromstring(xml_input, parser=parser)
        # Find <body> element in the XML namespace
        body = root.xpath("//tei:body", namespaces={"tei": "http://www.tei-c.org/ns/1.0"})

        if body:
            # Keep the <body> only if there is exactly one
            body_element = body[0] if len(body) == 1 else None

            if body_element is not None and len(body_element) > 0:
                # Extract the text content
                body_soup = BeautifulSoup(etree.tostring(body_element), "html.parser")
                body_text = body_soup.get_text(separator=" ", strip=True)
                return body_text
            else:
                st.warning("L'élément <body> est vide.")
                return None
        else:
            st.warning("Aucun élément <body> n'a été détecté dans le fichier XML.")
            return None

    except etree.XMLSyntaxError:
        st.warning("Format XML incorrect. Veuillez importer un fichier XML valide.")
        return None

def xml_mapping(entity, label):
    """
    Create an XML element based on an entity's given label.

    :param entity: entity text
    :type entity: str
    :param label: entity label
    :type label: str
    :returns: custom XML element if successful, default <name> element if not
    :rtype: etree.Element
    """

    element_mapping = {
        "CHRONOLOGIE": {"tag": "date"},
        "DECOR": {"tag": "name", "attrib": {"type": "decor"}},
        "EDIFICE": {"tag": "name", "attrib": {"type": "edifice"}},
        "ESPECE": {"tag": "name", "attrib": {"type": "espece"}},
        "GPE": {"tag": "placeName"},
        "ID": {"tag": "name", "attrib": {"type": "id"}},
        "LIEUDIT_SITE": {"tag": "placeName", "attrib": {"type": "lieudit_site"}},
        "LOC": {"tag": "geogName"},
        "MATERIAU": {"tag": "material"},
        "MOBILIER": {"tag": "objectName"},
        "ORG": {"tag": "orgName"},
        "PERSONNE": {"tag": "persName"},
        "PEUPLE_CULTURE": {"tag": "orgName", "attrib": {"type": "peuple_culture"}},
        "STRUCTURE": {"tag": "name", "attrib": {"type": "structure"}},
        "TECHNIQUE_STYLE": {"tag": "name", "attrib": {"type": "technique_style"}},
    }

    mapping = element_mapping.get(label)

    if mapping:
        xml_tag = etree.Element(mapping["tag"], attrib=mapping.get("attrib", {}))
        xml_tag.text = entity
        return xml_tag
    # If the mapping is impossible, encode the entity with a default <name type="generique">
    # (keeping the entity text so it is not lost when the element is spliced into the document)
    else:
        st.warning(f"Mapping introuvable pour le label : {label}. Entité par conséquent encodée comme : <name type=\"generique\">.")
        fallback = etree.Element("name", attrib={"type": "generique"})
        fallback.text = entity
        return fallback
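
# Example (illustrative): xml_mapping("bronze", "MATERIAU") yields <material>bronze</material>;
# xml_mapping("fosse", "STRUCTURE") yields <name type="structure">fosse</name>.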

def entities_to_xml(xml_content, ner):
    """
    Process XML content by replacing identified entities with XML elements.

    :param xml_content: original xml content
    :type xml_content: bytes or str
    :param ner: ner model
    :type ner: spacy.lang.fr.French
    :returns: modified XML content if successful, None otherwise
    :rtype: str or None
    """

    try:
        # Parse XML content
        parser = etree.XMLParser(recover=True)
        root = etree.fromstring(xml_content, parser=parser)
        # Find <body> element in the XML namespace
        body = root.xpath("//tei:body", namespaces={"tei": "http://www.tei-c.org/ns/1.0"})

        if body:
            # Keep the <body> only if there is exactly one
            body_element = body[0] if len(body) == 1 else None

            if body_element is not None and len(body_element) > 0:

                # Strip the <body> of the <hi> tags
                etree.strip_tags(body_element, "{http://www.tei-c.org/ns/1.0}hi")
                # Get the <body>'s descendants
                descendants = body_element.xpath(".//*")

                # Iterate through all descendants in the <body>
                for descendant in descendants:

                    if descendant.text:
                        # Apply ner model on the text of the descendant
                        doc = get_doc(ner, descendant.text)
                        # Make a reversed list of the entities based on their position,
                        # so splicing from right to left keeps earlier offsets valid
                        entities = get_entities(doc)
                        entities.sort(key=lambda ent: ent[2], reverse=True)

                        for ent in entities:
                            # Create an XML element with the appropriate tag
                            # Use the position of the entity to replace the right target only
                            xml_tag = xml_mapping(ent[0], ent[1])
                            start_index = ent[2]
                            end_index = ent[3]
                            descendant.text = (
                                descendant.text[:start_index]
                                + etree.tostring(xml_tag, encoding="unicode")
                                + descendant.text[end_index:]
                            )

                # Export modified XML content
                modified_xml = etree.tostring(root, xml_declaration=True, pretty_print=True, encoding="utf-8").decode("utf-8")
                return modified_xml
            else:
                st.warning("L'élément <body> est vide.")
                return None
        else:
            st.warning("Aucun élément <body> n'a été détecté dans le fichier XML.")
            return None
    except etree.XMLSyntaxError:
        st.error("Format XML incorrect. Veuillez importer un fichier XML valide.")
        return None
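
# Net effect (illustrative): lxml escapes the spliced-in tags inside descendant.text, and the
# html.unescape() call in main() restores them, so <p>La fosse 34</p> ends up as
# <p>La <name type="structure">fosse</name> <name type="id">34</name></p>.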


# ===== BODY OF THE PAGE =====

st.title("La reconnaissance des entités nommées dans le projet ARCHES")
st.header("Visualisation & extraction")
st.write("")
st.write("")


def main():

    # Download our models
    download_sentencizer()
    download_ner()

    # Load our models
    senter = load_sentencizer()
    ner = load_ner()

    with st.expander("Au sujet des entités nommées recherchées"):
        st.markdown("**Les différents types d'entités sont :** \n\n- **CHRONOLOGIE :** utilisé pour les références chronologiques (\"Antiquité\", \"XIIe siècle\", \"200 av. n. ère\", etc.). \n- **MOBILIER :** utilisé pour le mobilier (\"os\", \"pot\", \"tuile\", etc.). \n- **STRUCTURE :** utilisé pour les structures archéologiques (\"fosse\", \"mur\", \"fossé\", \"foyer\", etc.). \n- **MATERIAU :** utilisé pour les matériaux (\"bronze\", \"dolérite\", \"terre cuite\", etc.). \n- **ID :** utilisé pour les identifiants de vestiges (\"4\" pour \"le fossé 4\" par exemple). \n- **TECHNIQUE_STYLE :** utilisé pour les mentions de techniques et styles de fabrication ou construction (\"taillé\", \"glaçuré\", \"en petit appareil\", etc.). \n- **DECOR :** utilisé pour les éléments de décor. \n- **ESPECE :** utilisé pour signaler les taxons et noms vernaculaires rencontrés dans le texte. \n- **EDIFICE :** utilisé pour les édifices et monuments nommés (\"église Saint-Paul\", \"pont du Gard\", etc.). \n- **PEUPLE_CULTURE :** utilisé pour les cultures et peuples évoqués (tribus gauloises, cultures préhistoriques, etc.). \n- **PERSONNE :** utilisé pour les noms de personnes (historiques, fictives, équipe scientifique, etc.). \n- **ORG :** utilisé pour les institutions, sociétés, laboratoires, universités, musées, archives, etc. \n- **GPE :** utilisé pour les entités géopolitiques (villes, départements, États, etc.). \n- **LOC :** utilisé pour les lieux non-GPE (lieux naturels par exemple). \n- **LIEUDIT_SITE :** utilisé pour les lieux-dits et noms de sites archéologiques.")
        st.write("")

    # Select input type
    use_type = st.radio("Veuillez choisir le type de données à analyser :", ("Taper un exemple", "Importer un fichier texte", "Importer un fichier xml-tei"))
    st.write("")

    # ===== MODE: EXAMPLE ON THE GO =====
    if use_type == "Taper un exemple":

        # Checkbox to apply our custom sentence segmentation model
        bouton_phraseur = st.checkbox("Cochez cette case pour resegmenter les phrases de votre document selon notre modèle entraîné sur des rapports d'opération")

        st.write("")
        st.write("")
        # Create a text area
        raw_text = st.text_area("Veuillez saisir votre exemple dans le bloc ci-dessous (max. 5000 caractères)", "La fosse 34 a livré des restes de pinces en bronze et quelques grains d'orge.", max_chars=5000)
        st.write("")

        # Launch prediction
        if st.button("Lancer la prédiction"):

            if len(raw_text) > 0:

                # If requested, apply the sentence segmentation model
                if bouton_phraseur:
                    raw_text = apply_senter(senter, raw_text)

                # Apply ner model
                doc = get_doc(ner, raw_text)
                entities = get_entities(doc)

                st.write("")
                st.subheader("Résultats :")
                st.write("")
                st.write("")

                # Display the entities with displacy
                my_displacy = create_displacy(raw_text, entities)
                st.markdown(my_displacy, unsafe_allow_html=True)

                st.write("")
                # Download results as a conll2002 file
                doc_to_conll(doc)
                st.write("")

                df = create_df(entities)
                st.write("")
                # Display the entities as a table
                st.markdown("**Tableau regroupant les entités détectées**")
                st.write("")
                st.dataframe(df, use_container_width=True)

                csv = df_to_csv(df)

                st.write("")

                # Download results as a csv file
                st.download_button(
                    label="Télécharger le fichier CSV",
                    data=csv,
                    file_name="prediction_arches.csv",
                    mime="text/csv",
                )

            else:
                st.warning("Veuillez saisir un exemple.")

    # ===== MODE: LOAD A PLAIN TEXT FILE =====
    if use_type == "Importer un fichier texte":

        # Checkbox to apply our custom sentence segmentation model
        bouton_phraseur = st.checkbox("Cochez cette case pour resegmenter les phrases de votre document selon notre modèle entraîné sur des rapports d'opération")
        st.write("")
        st.write("")

        # Upload a plain text file
        uploaded_file = st.file_uploader("Importez un fichier texte (.txt)", type="txt")

        if uploaded_file is not None:
            # Collect the name of the uploaded file (for the future export)
            updated_name = uploaded_file.name[:-4]

            stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
            file_contents = stringio.read()

            # Launch prediction
            if st.button("Lancer la prédiction"):

                if len(file_contents) > 0:

                    # If requested, apply the sentence segmentation model
                    if bouton_phraseur:
                        file_contents = apply_senter(senter, file_contents)

                    # Apply the ner model
                    doc = get_doc(ner, file_contents)
                    entities = get_entities(doc)

                    st.write("")
                    st.subheader("Résultats :")
                    st.write("")
                    st.write("")

                    # Display the entities with displacy
                    with st.expander("Voir les entités dans le texte"):
                        my_displacy = create_displacy(file_contents, entities)
                        st.markdown(my_displacy, unsafe_allow_html=True)

                    st.write("")
                    # Download the results as a conll2002 file
                    doc_to_conll(doc, updated_name)
                    st.write("")

                    df = create_df(entities)
                    st.write("")
                    # Display the entities as a table
                    with st.expander("Voir les entités sous forme de tableau"):
                        st.write("")
                        st.dataframe(df, use_container_width=True)

                    csv = df_to_csv(df)

                    st.write("")

                    # Download the results as a csv file
                    st.download_button(
                        label="Télécharger le fichier CSV",
                        data=csv,
                        file_name=updated_name + ".csv",
                        mime="text/csv",
                    )

                else:
                    st.warning("Le fichier importé est vide.")

    # ===== MODE: LOAD AN XML FILE =====
    if use_type == "Importer un fichier xml-tei":

        # User chooses between xml or conll2002 & csv export
        choix_xml = st.radio("Comment souhaitez-vous appliquer le modèle sur le <body> ?", ("Conserver les balises (export xml de l'intégralité du fichier importé)", "Ne pas conserver les balises (export conll2002 ou csv du <body> uniquement)"))

        # ===== MODE: XML EXPORT =====
        if choix_xml == "Conserver les balises (export xml de l'intégralité du fichier importé)":

            # Upload an xml file
            uploaded_file = st.file_uploader("Importez un fichier XML (.xml)", type="xml")

            if uploaded_file is not None:
                # Collect the name of the uploaded file (for the export later)
                updated_name = uploaded_file.name[:-4]
                file_contents = uploaded_file.read()

                # Launch prediction
                if st.button("Lancer la prédiction"):

                    if len(file_contents) > 0:

                        # Apply the ner model to an xml file
                        modified_xml = entities_to_xml(file_contents, ner)

                        if modified_xml is not None:

                            # Convert HTML entities back to characters
                            modified_xml = html.unescape(modified_xml)

                            st.write("")

                            with st.expander("Au sujet du mapping XML des entités"):
                                st.markdown(
                                    "**Les entités ont été converties comme suit :** \n\n- **CHRONOLOGIE :** ```<date>``` \n- **MOBILIER :** ```<objectName>``` \n- **STRUCTURE :** ```<name type=\"structure\">``` \n- **MATERIAU :** ```<material>``` \n- **ID :** ```<name type=\"id\">``` \n- **TECHNIQUE_STYLE :** ```<name type=\"technique_style\">``` \n- **DECOR :** ```<name type=\"decor\">``` \n- **ESPECE :** ```<name type=\"espece\">``` \n- **EDIFICE :** ```<name type=\"edifice\">``` \n- **PEUPLE_CULTURE :** ```<orgName type=\"peuple_culture\">``` \n- **PERSONNE :** ```<persName>``` \n- **ORG :** ```<orgName>``` \n- **GPE :** ```<placeName>``` \n- **LOC :** ```<geogName>``` \n- **LIEUDIT_SITE :** ```<placeName type=\"lieudit_site\">```")
                                st.write("")

                            # Display the modified XML
                            with st.expander("Contenu XML modifié"):
                                # Wrap the code
                                with stylable_container(
                                    "codeblock",
                                    """
                                    code {
                                        white-space: pre-wrap !important;
                                    }
                                    """,
                                ):
                                    st.code(modified_xml, language="xml")

                            # Download the modified XML
                            # We add a "_entites" at the end of the file name to differentiate it from the original
                            st.download_button(
                                label="Télécharger le fichier xml modifié",
                                data=modified_xml,
                                file_name=updated_name + "_entites.xml",
                                mime="text/xml",
                            )

        # ===== MODE: CONLL2002 & CSV EXPORT =====
        if choix_xml == "Ne pas conserver les balises (export conll2002 ou csv du <body> uniquement)":

            st.write("")

            # Checkbox to apply our custom sentence segmentation model
            bouton_phraseur = st.checkbox(
                "Cochez cette case pour resegmenter les phrases de votre document selon notre modèle entraîné sur des rapports d'opération")
            st.write("")
            st.write("")

            # Upload an xml file
            uploaded_file = st.file_uploader("Importez un fichier XML (.xml)", type="xml")

            if uploaded_file is not None:

                # Collect the name of the file (for the export later)
                updated_name = uploaded_file.name[:-4]

                file_contents = uploaded_file.read()

                # Launch prediction
                if st.button("Lancer la prédiction"):

                    if len(file_contents) > 0:

                        st.write("")
                        # Strip the <body> of its tags
                        body_text = get_body_text(file_contents)

                        if body_text is not None:

                            # If requested, apply the sentence segmentation model
                            if bouton_phraseur:
                                body_text = apply_senter(senter, body_text)

                            # Apply ner model
                            doc = get_doc(ner, body_text)
                            entities = get_entities(doc)

                            st.write("")
                            st.subheader("Résultats :")
                            st.write("")
                            st.write("")

                            # Display the entities with displacy
                            with st.expander("Voir les entités dans le texte"):
                                my_displacy = create_displacy(body_text, entities)
                                st.markdown(my_displacy, unsafe_allow_html=True)

                            st.write("")
                            # Download the results as a conll2002 file
                            doc_to_conll(doc, updated_name)
                            st.write("")

                            df = create_df(entities)
                            st.write("")
                            # Display the entities as a table
                            with st.expander("Voir les entités sous forme de tableau"):
                                st.write("")
                                st.dataframe(df, use_container_width=True)

                            csv = df_to_csv(df)

                            st.write("")

                            # Download the results as a csv file
                            st.download_button(
                                label="Télécharger le fichier CSV",
                                data=csv,
                                file_name=updated_name + ".csv",
                                mime="text/csv",
                            )


if __name__ == "__main__":
    main()
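
Once their wheels are installed, the two models are ordinary spaCy packages, so the same pipeline can be exercised outside Streamlit. A minimal sketch (assuming the fr_arches wheel referenced in download_ner above is already installed):

import spacy

nlp = spacy.load("fr_arches")
doc = nlp("La fosse 34 a livré des restes de pinces en bronze.")
for ent in doc.ents:
    print(ent.text, ent.label_, ent.start_char, ent.end_char)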

requirements.txt
ADDED
@@ -0,0 +1,6 @@
streamlit
streamlit-extras  # provides streamlit_extras.stylable_container, imported in app.py
spacy[transformers]
pandas
lxml
beautifulsoup4
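
To try the app locally, a sketch of a typical run (on the Space itself the app starts automatically, and the two model wheels are fetched at first run by download_sentencizer and download_ner):

pip install -r requirements.txt
streamlit run app.py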
static/Logo_cnrs.png
ADDED
static/logo_inrap.png
ADDED
static/logo_ir_metopes.png
ADDED
static/logo_mesr.png
ADDED
static/logo_mrsh.jpg
ADDED
static/logo_ouvrir_la_science.png
ADDED
static/logo_unicaen.png
ADDED