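"""NER4Archives visualizer.

A Streamlit proof-of-concept application that applies named-entity
recognition (NER), and optionally entity linking via entity-fishing,
to XML EAD finding aids.
"""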
import json
import re
import subprocess

import pandas as pd
import requests
import spacy
import streamlit
from lxml import etree

CONFIG_FILE = "config.json"
ASSETS_DIR = "assets"
XML_PARSER_CONFIG = {'ns_clean': True, 'recover': True, 'encoding': 'utf-8'}
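
# Display colors for displacy, covering both the NER4Archives label set
# (EVENT, LOCATION, ...) and the short CoNLL-style labels (LOC, PER, ...).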
ENTITY_COLORS = {
    "EVENT": "#ec7063",
    "LOCATION": "#45b39d",
    "ORGANISATION": "#f39c12",
    "PERSON": "#3498db",
    "TITLE": "#a569bd",
    "LOC": "#45b39d",
    "MISC": "#ec7063",
    "ORG": "#f39c12",
    "PER": "#3498db"
}
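
# Map of selectable model names to their Hugging Face wheel URLs;
# the empty entry is the selectbox placeholder (no model selected).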
MAP_MODELS = {
    "": "",
    "fr_ner4archives_v3_default": "https://huggingface.co/ner4archives/fr_ner4archives_v3_default/resolve/main/fr_ner4archives_v3_default-any-py3-none-any.whl",
    "fr_ner4archives_v3_with_vectors": "https://huggingface.co/ner4archives/fr_ner4archives_v3_with_vectors/resolve/main/fr_ner4archives_v3_with_vectors-any-py3-none-any.whl"
}
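
# config.json is expected to provide at least the entity-fishing API
# endpoint under the "ef_endpoint" key.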
with open(CONFIG_FILE, mode="r") as json_file:
    CONFIGURATION = json.load(json_file)

streamlit.set_page_config(layout="wide")
streamlit.title("NER4Archives visualizer")


def ead_strategy(tree):
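    """Extract <did> elements and their text from every <dsc> of an EAD tree.

    Returns a tuple (container_dids, sentences) where each sentence merges
    a <did>'s text with the text of its following sibling (e.g. <scopecontent>).
    """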
    sentences = []
    container_dids = []
    dsc = tree.xpath('.//dsc')
    for children_dsc in dsc:
        for did in children_dsc.xpath('.//did'):
            container_dids.append(did)
            text = ""
            if did is not None:
                text += " ".join(
                    [did_content.strip() for did_content in did.itertext() if len(did_content) > 0])
            if did.getnext() is not None:
                text += " ".join(
                    [" ".join(scopecontent.strip().split()) for scopecontent in did.getnext().itertext()
                     if len(scopecontent) > 0])
            sentences.append(" " + re.sub(r"\s{2,}", " ", text.strip()) + " ")
    return container_dids, sentences


def process_xml(data):
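    """Parse raw XML bytes and return (pretty-printed XML, dids, sentences)."""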
    parser = etree.XMLParser(**XML_PARSER_CONFIG)
    tree = etree.fromstring(data, parser=parser)
    xml = etree.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")
    dids, sentences = ead_strategy(tree)
    return xml, dids, sentences


def is_entity_fishing_online():
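    """Ping the root of the configured entity-fishing service.

    Returns True if the server answers with HTTP 200, False otherwise.
    """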
    try:
        response = requests.get("/".join(CONFIGURATION["ef_endpoint"].split("/")[:-1]))
        return response.status_code == 200
    except requests.RequestException:
        return False


def setup_sidebar():
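    """Render the sidebar: project description and partner logos."""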
    streamlit.sidebar.title("NER4Archives visualizer")
    streamlit.sidebar.write("## Motivation")
    streamlit.sidebar.markdown("""<div style="text-align: justify;">
    <p>This application is a proof of concept for applying a text-classification task (also called named-entity recognition, NER) to
    XML <a href="https://www.loc.gov/ead/" target="_blank">EAD</a> <a href="https://fr.wikipedia.org/wiki/Instrument_de_recherche" target="_blank">finding aids</a> and evaluating the NER predictions.</p>
    <p>In the context of the <a href="https://github.com/NER4Archives-project" target="_blank">NER4Archives project</a> (INRIA-ALMAnaCH/Archives nationales), the goal is to train NER models on an annotated dataset
    extracted from XML EAD finding aids and to test them on new data.</p>
    <p>Most of the models available here are trained with the <a href="https://spacy.io/" target="_blank">spaCy</a> NLP
    framework and are available on the <a href="https://huggingface.co/ner4archives" target="_blank">HF organisation hub</a>.
    Other models may be added in the future.</p>
    <p>The project also includes a downstream entity-linking task. The <a href="https://github.com/Lucaterre/spacyfishing" target="_blank">spacyfishing</a> extension (based on <a href="https://github.com/kermitt2/entity-fishing" target="_blank">entity-fishing</a>) is used here for this purpose.</p>
    NER4Archives - 2022/2023</div>
    """, unsafe_allow_html=True)
    scol1, scol2 = streamlit.sidebar.columns(2)
    scol1.image(f"{ASSETS_DIR}/an.png", width=170)
    scol2.image(f"{ASSETS_DIR}/almanach_rouge-inria.png", width=100)


def main():
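    """Application flow: upload an XML EAD, preview it, configure the NER
    pipeline (model, GPU, entity linking), run it, and explore the results.
    """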
    setup_sidebar()
    flag_file = False
    flag_view = False
    flag_model = False
    data = ""
    model = ""
    linking = False
    entities = []

    streamlit.write("## 📄 Input XML EAD:")
    filename = streamlit.file_uploader("Upload an XML EAD (format .xml)", type="xml")
    streamlit.markdown(
        "or use an XML EAD provided in the [`samples/`](https://huggingface.co/spaces/ner4archives/ner4archives-NEL-vizualizer-app/tree/main/samples) directory")

    if filename is not None:
        # Round-trip through UTF-8 to validate the encoding before parsing.
        data = filename.getvalue().decode("utf-8").encode("utf-8")
        if len(data) > 0:
            flag_file = True

    if flag_file:
        col1, col2 = streamlit.columns(2)
        col1.write("## 🗂️ XML tree view:")
        col2.write("## 🗒️ Plain text view:")
        xml, _, sentences = process_xml(data)
        col1.text_area("XML Tree View (read-only)", value=xml, height=500, disabled=True)
        plain = "\n".join(sentences)
        col2.text_area("Plain Text View (read-only)", value=plain, height=500, disabled=True)
        flag_view = True
        flag_model = False

    if flag_view:
        streamlit.write("## ⚙️ Configure NER pipeline and options:")
        streamlit.write(
            "⚠️ Using a BERT-based model and/or entity linking may considerably increase processing time.")

        models = list(MAP_MODELS.keys())
        option = streamlit.selectbox(
            'Choose a NER model to apply from the list:',
            models,
            index=0)
        model = option
        if model != "":
            try:
                # Loading succeeds only if the model is already installed.
                spacy.load(model)
                flag_model = True
                streamlit.write(f"{model} is available locally.")
            except OSError:
                placeholder = streamlit.empty()
                button = streamlit.button(f"Download model: {model}")
                with placeholder.container():
                    if button:
                        streamlit.write(f"Download of model {model} in progress...")
                        returncode = subprocess.Popen(["pip", "install", MAP_MODELS[model]]).wait()
                        if returncode == 0:
                            streamlit.write(f"Download of model {model} done.")
                            flag_model = True
                            streamlit.write(f"{model} is available locally.")
                            placeholder.empty()

    if flag_model:
        gpu = streamlit.checkbox('Check to use GPU (if available)', value=False)
        gpu_icon = "❌"
        if gpu:
            spacy.prefer_gpu()
            gpu_icon = "✅️"
        else:
            spacy.require_cpu()

        if is_entity_fishing_online():
            streamlit.write("Entity-fishing server status: 🟢 (you can use the linking feature)")
            linking = streamlit.checkbox('Check to apply named-entity linking (entity-fishing component)',
                                         value=False)
            linkingicon = "✅️" if linking else "❌"
        else:
            streamlit.write("Entity-fishing server status: 🔴 (you can't use the linking feature)")
            linking = False
            linkingicon = "❌"
        streamlit.write("#### Current parameters:")
        streamlit.write(f"- NER model selected: {option}\n- Linking activated: {linkingicon}\n- GPU activated: {gpu_icon}")

    if flag_model:
        if streamlit.button('Launch'):
            plain = "\n".join(sentences)
            with streamlit.spinner('Initializing NER...'):
                nlp = spacy.load(model)
                nlp.max_length = 5000000
                if linking:
                    nlp.add_pipe('entityfishing',
                                 config={"language": "fr", "api_ef_base": CONFIGURATION['ef_endpoint']})

            with streamlit.spinner('NER processing...'):
                # Track a running character offset so that entity spans can be
                # mapped back onto the "\n"-joined plain-text view.
                start_sentence = 0
                for doc in nlp.pipe(sentences):
                    end_sentence = start_sentence + len(doc.text) + 1
                    for ent in doc.ents:
                        start_tok = start_sentence + ent.start_char
                        end_tok = start_tok + len(ent.text)
                        entities.append((
                            start_tok,
                            end_tok,
                            ent.text,
                            ent.label_,
                            ent._.kb_qid if linking else "",
                            ent._.url_wikidata if linking else "",
                            ent._.nerd_score if linking else ""
                        ))
                    start_sentence = end_sentence

            streamlit.success('🎉 NER applied successfully!')

            df = pd.DataFrame(entities, columns=['START',
                                                 'END',
                                                 'MENTION',
                                                 'NER LABEL',
                                                 'QID',
                                                 'WIKIDATA RESOURCE (Wikidata disambiguation)',
                                                 'LINKING SCORE'])
            df[['START', 'END']] = df[['START', 'END']].astype(int)
            streamlit.write("## 🔎 Explore named entities in a table:")
            streamlit.write(df)

            streamlit.write("## 📖 Explore named entities in the text:")
            ents_html = spacy.displacy.render(
                [{"text": plain,
                  "ents": [{"start": ent[0],
                            "end": ent[1],
                            "label": ent[3],
                            "kb_id": ent[4] if linking else "",
                            "kb_url": ent[5] if linking else ""
                            } for ent in entities]}],
                style="ent",
                manual=True,
                options={
                    "ents": ["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE", "LOC", "MISC", "ORG", "PER"],
                    "colors": ENTITY_COLORS
                })
            streamlit.markdown(ents_html, unsafe_allow_html=True)


if __name__ == "__main__":
    main()