import re |
import json |
import subprocess |
import requests |
import streamlit |
import spacy |
from lxml import etree |
import pandas as pd |
# JSON settings file read at import time (provides the "ef_endpoint" entry).
CONFIG_FILE = "config.json"
# Directory holding the logo images shown in the sidebar.
ASSETS_DIR = "assets"
# Keyword arguments forwarded to ``etree.XMLParser`` in ``process_xml``.
XML_PARSER_CONFIG = {'ns_clean': True, 'recover': True, 'encoding': 'utf-8'}

# Highlight colour per entity label, handed to displaCy as its "colors"
# option in ``main``. Long and short label spellings (PERSON/PER,
# LOCATION/LOC, ORGANISATION/ORG, EVENT/MISC) share the same colour.
ENTITY_COLORS = {
    "EVENT": "#ec7063",
    "LOCATION": "#45b39d",
    "ORGANISATION": "#f39c12",
    "PERSON": "#3498db",
    "TITLE": "#a569bd ",
    "LOC": "#45b39d",
    "MISC": "#ec7063",
    "ORG": "#f39c12",
    "PER": "#3498db"
}

# Pipeline package name -> URL of its wheel on the Hugging Face hub (each URL
# carries a sha256 fragment).
# NOTE(review): the assignment line of this mapping was missing from the file
# and nothing visible here references the mapping, so the name ``MAP_MODELS``
# is a reconstruction - confirm it. The empty entry is presumably a blank
# default choice; verify before relying on it.
MAP_MODELS = {
    "":"",
    "fr_ner4archives_V3_camembert_base": "https://huggingface.co/ner4archives/fr_ner4archives_V3_camembert_base/resolve/main/fr_ner4archives_V3_camembert_base-any-py3-none-any.whl#sha256=919aca257e4b2342a3773e7dcc3442b30f2ac50c4a3466d5ecda5e0e337ea396",
    "fr_ner4archives_v3_default": "https://huggingface.co/ner4archives/fr_ner4archives_v3_default/resolve/main/fr_ner4archives_v3_default-any-py3-none-any.whl#sha256=040a5c67a7e5a14e0125b2600e6aff1323f8ba9d800d5ab49446faeaf2dad6e6",
    "fr_ner4archives_v3_with_vectors":"https://huggingface.co/ner4archives/fr_ner4archives_v3_with_vectors/resolve/main/fr_ner4archives_v3_with_vectors-any-py3-none-any.whl#sha256=4c938618ce84a919008d17822980cfd914f9ad8f73231a64cd1ffcaa80ab09f2"
}
# Load the application settings once, at import time. The encoding is given
# explicitly so decoding does not depend on the platform's locale default.
# ``CONFIGURATION["ef_endpoint"]`` is read by the entity-fishing health check
# and by the linking component configured in ``main``.
with open(CONFIG_FILE, mode="r", encoding="utf-8") as json_file:
    CONFIGURATION = json.load(json_file)

# Page-level Streamlit setup: wide layout and the main page title.
streamlit.set_page_config(layout="wide")
streamlit.title("NER4Archives visualizer")
def ead_strategy(tree):
    """Extract one plain-text "sentence" per ``<did>`` found under ``<dsc>``.

    For every ``<did>`` descendant of every ``<dsc>`` descendant of *tree*,
    the text of the ``<did>`` and the text of its next sibling (when there is
    one) are joined into a single string, whitespace runs of two or more
    characters are collapsed to one space, and the result is padded with one
    leading and one trailing space.

    Args:
        tree: an element exposing ``xpath``; matched elements must expose
            ``itertext`` and ``getnext`` (lxml elements do).

    Returns:
        A ``(container_dids, sentences)`` tuple of two parallel lists: the
        ``<did>`` elements and the sentence built for each of them.
    """
    sentences = []
    container_dids = []
    for dsc in tree.xpath('.//dsc'):
        for did in dsc.xpath('.//did'):
            container_dids.append(did)
            parts = [" ".join(fragment.strip() for fragment in did.itertext() if fragment)]
            # NOTE(review): the next sibling is used whatever its tag is; the
            # original variable name suggests a <scopecontent> is expected -
            # confirm that other siblings should contribute text as well.
            sibling = did.getnext()
            if sibling is not None:
                parts.append(
                    " ".join(" ".join(fragment.split()) for fragment in sibling.itertext() if fragment))
            # Join with an explicit space: without it the last word of the
            # <did> and the first word of the sibling were glued together
            # whenever the XML had no whitespace between the two elements.
            text = " ".join(parts)
            sentences.append(" " + re.sub(r"\s{2,}", " ", text.strip()) + " ")
    return container_dids, sentences
def process_xml(data):
    """Parse an XML EAD payload and derive the views shown in the app.

    Args:
        data: the raw XML document handed to ``etree.fromstring``.

    Returns:
        A ``(xml, dids, sentences)`` tuple: the pretty-printed document as a
        string, then the two lists produced by ``ead_strategy``. When the
        parser yields no root element, ``("", [], [])`` is returned.
    """
    parser = etree.XMLParser(**XML_PARSER_CONFIG)
    tree = etree.fromstring(data, parser=parser)
    if tree is None:
        # NOTE(review): the parser is configured with recover=True, and a
        # recovering lxml parser is expected to hand back None rather than
        # raise when nothing can be salvaged - confirm against the lxml
        # version in use. Serialising None would otherwise fail below.
        return "", [], []
    xml = etree.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")
    dids, sentences = ead_strategy(tree)
    return xml, dids, sentences
def is_entity_fishing_online(timeout=5):
    """Tell whether the configured entity-fishing service answers.

    The URL probed is ``CONFIGURATION["ef_endpoint"]`` with its last
    ``/``-separated segment removed.

    Args:
        timeout: seconds to wait for the server before giving up, so an
            unreachable host cannot stall the page indefinitely.

    Returns:
        ``True`` when the GET request comes back with status 200; ``False``
        for any other status, for a request failure, or when the endpoint is
        missing from the configuration or is not a string.
    """
    try:
        endpoint = CONFIGURATION["ef_endpoint"]
        base_url = "/".join(endpoint.split("/")[:-1])
        response = requests.get(base_url, timeout=timeout)
    except (requests.RequestException, KeyError, AttributeError):
        # Best-effort probe: any failure just means "linking unavailable".
        return False
    return response.status_code == 200
def setup_sidebar():
    """Fill the Streamlit sidebar with the title, motivation text and logos.

    Writes the application title, a "Motivation" heading and an HTML blurb
    (rendered with ``unsafe_allow_html=True``), then shows two images from
    ``ASSETS_DIR`` side by side in a two-column layout.
    """
    streamlit.sidebar.title("NER4Archives visualizer")
    streamlit.sidebar.write("## Motivation")
    # The second paragraph used to end with an opening <p> instead of </p>,
    # which left a stray empty paragraph in the rendered HTML.
    streamlit.sidebar.markdown("""<div style="text-align: justify;">
<p>This application is a proof-of-concept to apply and evaluate text classification task (also called Named-Entity Recognition) on
XML <a href="https://www.loc.gov/ead/" target="_blank">EAD</a> <a href="https://fr.wikipedia.org/wiki/Instrument_de_recherche" target="_blank">finding aids</a> and evaluate NER predictions.</p>
<p>In the context of the <a href="https://github.com/NER4Archives-project" target="_blank">NER4Archives project</a> (INRIA-ALMAnaCH/Archives nationales), the goal is to train NER models on annotated dataset
extracted from XML EAD finding aids and test it on new data.</p>
<p>Most of the models available here are trained with the NLP <a href="https://spacy.io/" target="_blank">spaCy</a>
framework and are available on the <a href="https://huggingface.co/ner4archives" target="_blank">HF organisation hub</a>.
Other models may be added in the future.</p>
<p>The project also includes a downstream entity linking task. The <a href="https://github.com/Lucaterre/spacyfishing" target="_blank">SpaCy fishing</a> extension (based on <a href="https://github.com/kermitt2/entity-fishing" target="_blank">entity-fishing</a>) is used here to support this purpose.</p>
NER4Archives - 2022/2023</div>
""", unsafe_allow_html=True)
    scol1, scol2 = streamlit.sidebar.columns(2)
    scol1.image(f"{ASSETS_DIR}/an.png", width=170)
    scol2.image(f"{ASSETS_DIR}/almanach_rouge-inria.png", width=100)
def _collect_entities(nlp, sentences, linking):
    """Run *nlp* over *sentences* and return one 7-field tuple per entity.

    Each tuple is ``(start, end, text, label, qid, wikidata_url, score)``.
    ``start``/``end`` are character offsets into ``"\\n".join(sentences)``.
    The last three fields are read from the entity's ``_.kb_qid``,
    ``_.url_wikidata`` and ``_.nerd_score`` extensions when *linking* is
    true, and are empty strings otherwise.
    """
    rows = []
    offset = 0
    for doc in nlp.pipe(sentences):
        for ent in doc.ents:
            start = offset + ent.start_char
            if linking:
                link_fields = (ent._.kb_qid, ent._.url_wikidata, ent._.nerd_score)
            else:
                link_fields = ("", "", "")
            rows.append((start, start + len(ent.text), ent.text, ent.label_, *link_fields))
        # +1 for the "\n" that separates sentences in the joined plain text.
        offset += len(doc.text) + 1
    return rows


def main():
    """Drive the page: upload, preview, pipeline options, NER run, results.

    Each stage only runs once the previous one has produced what it needs;
    otherwise the function returns early and the page stops at that stage.

    NOTE(review): several glyphs in the user-facing strings were mis-decoded
    in this file. The status icons (check mark, cross, green/red dot, warning
    sign) were restored because the damaged bytes and their context identify
    them; the leading glyphs of the section headings could not be recovered
    and are left exactly as found.
    """
    setup_sidebar()

    # --- 1. Input -----------------------------------------------------------
    streamlit.write("## π Input XML EAD:")
    filename = streamlit.file_uploader("Upload an XML EAD (format .xml)", type="xml")
    streamlit.markdown(
        "or use an XML EAD provided in [`samples/`](https://huggingface.co/spaces/ner4archives/ner4archives-NEL-vizualizer-app/blob/main/samples/) directory")
    if filename is None:
        return
    # The decode/encode round trip rejects uploads that are not valid UTF-8.
    data = filename.getvalue().decode("utf-8").encode("utf-8")
    if not data:
        return

    # --- 2. Preview ---------------------------------------------------------
    col1, col2 = streamlit.columns(2)
    col1.write("## ποΈ XML tree view:")
    col2.write("## ποΈ Plain text view:")
    xml, _, sentences = process_xml(data)
    plain = "\n".join(sentences)
    col1.text_area("XML Tree View (read-only)", value=xml, height=500, disabled=True)
    col2.text_area("Plain Text View (read-only)", value=plain, height=500, disabled=True)

    # --- 3. Pipeline options --------------------------------------------------
    streamlit.write("## βοΈ Configure NER pipeline and options:")
    streamlit.write("⚠️ Using Bert based model and/or linking may increase considerably the processing time.")
    models = list(spacy.info()["pipelines"])
    option = streamlit.selectbox(
        'Choose a NER model you want to apply in the list: ',
        models)
    model = option
    if not model:
        # Nothing selectable (e.g. no pipeline installed): loading would fail.
        streamlit.warning("No spaCy pipeline is available; install a model to run NER.")
        return

    gpu = streamlit.checkbox('Check to use GPU (if available)', value=False)
    if gpu:
        # prefer_gpu() reports whether a GPU was really activated, so the
        # icon reflects the actual device rather than just the checkbox.
        gpu_icon = "✅️" if spacy.prefer_gpu() else "❌"
    else:
        spacy.require_cpu()
        gpu_icon = "❌"

    if is_entity_fishing_online():
        streamlit.write("Entity-fishing server status: 🟢 (you can use linking feature)")
        linking = streamlit.checkbox('Check to apply named entity linking (entity-fishing component)',
                                     value=False)
    else:
        streamlit.write("Entity-fishing server status: 🔴 (you can't use linking feature)")
        linking = False
    linkingicon = "✅️" if linking else "❌"

    streamlit.write("#### Actual Parameters:")
    streamlit.write(f'- NER model selected: {option}\n - Linking activated: {linkingicon} - GPU activated: {gpu_icon}')

    # --- 4. Run ---------------------------------------------------------------
    if not streamlit.button('Launch'):
        return
    with streamlit.spinner('Initialize NER...'):
        nlp = spacy.load(model)
        nlp.max_length = 5000000
        if linking:
            nlp.add_pipe('entityfishing',
                         config={"language": "fr", "api_ef_base": CONFIGURATION['ef_endpoint']})
    with streamlit.spinner('NER processing...'):
        entities = _collect_entities(nlp, sentences, linking)
    streamlit.success('π NER applied with success!')

    # --- 5. Results -----------------------------------------------------------
    # One column name per tuple field: the previous 4-name list did not match
    # the 7-field tuples and made DataFrame construction fail as soon as one
    # entity was found.
    df = pd.DataFrame(entities, columns=['START',
                                         'END',
                                         'MENTION',
                                         'NER LABEL',
                                         'QID',
                                         'WIKIDATA RESSOURCE (wikidata disambiguation)',
                                         'LINKING SCORE',
                                         ])
    df[['START', 'END']] = df[['START', 'END']].astype(int)
    streamlit.write("## π Explore named entities in table: ")
    streamlit.write(df)
    streamlit.write("## π Explore named entities in text: ")
    ents_html = spacy.displacy.render(
        [{"text": plain,
          "ents": [{"start": ent[0],
                    "end": ent[1],
                    "label": ent[3],
                    "kb_id": ent[4] if linking else "",
                    "kb_url": ent[5] if linking else ""
                    } for ent in entities]}],
        style="ent",
        manual=True,
        options={
            "colors": ENTITY_COLORS
        })
    streamlit.markdown(ents_html, unsafe_allow_html=True)
# Script entry point: build the page only when this file is run directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()