presidio / presidio_nlp_engine_config.py
omri374's picture
Upload 12 files
41e004f
import logging
from typing import Tuple
import spacy
from presidio_analyzer import RecognizerRegistry
from presidio_analyzer.nlp_engine import (
NlpEngine,
NlpEngineProvider,
)
# Module-level logger, registered under the name "presidio-streamlit".
logger = logging.getLogger("presidio-streamlit")
def create_nlp_engine_with_spacy(
    model_path: str,
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Build a spaCy-backed NlpEngine together with a recognizer registry.

    :param model_path: path to model / model name, registered for the
        "en" language code.
    :return: tuple of (the created NlpEngine, a RecognizerRegistry on which
        the predefined recognizers were loaded with that engine).
    """
    # Keys are labels produced by the NER model, values are the Presidio
    # entity names they are reported as.
    entity_mapping = {
        "PER": "PERSON",
        "PERSON": "PERSON",
        "NORP": "NRP",
        "FAC": "FACILITY",
        "LOC": "LOCATION",
        "GPE": "LOCATION",
        "LOCATION": "LOCATION",
        "ORG": "ORGANIZATION",
        "ORGANIZATION": "ORGANIZATION",
        "DATE": "DATE_TIME",
        "TIME": "DATE_TIME",
    }

    # Organization labels are flagged as low-score entities with a 0.4
    # multiplier (presumably scaling their confidence -- see the Presidio
    # NER model configuration docs).
    ner_settings = {
        "model_to_presidio_entity_mapping": entity_mapping,
        "low_confidence_score_multiplier": 0.4,
        "low_score_entity_names": ["ORG", "ORGANIZATION"],
    }

    engine_settings = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": model_path}],
        "ner_model_configuration": ner_settings,
    }

    provider = NlpEngineProvider(nlp_configuration=engine_settings)
    engine = provider.create_engine()

    recognizers = RecognizerRegistry()
    recognizers.load_predefined_recognizers(nlp_engine=engine)

    return engine, recognizers
def create_nlp_engine_with_stanza(
    model_path: str,
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Build a stanza-backed NlpEngine together with a recognizer registry.

    :param model_path: path to model / model name, registered for the
        "en" language code.
    :return: tuple of (the created NlpEngine, a RecognizerRegistry on which
        the predefined recognizers were loaded with that engine).
    """
    # Keys are labels produced by the NER model, values are the Presidio
    # entity names they are reported as.
    label_to_entity = {
        "PER": "PERSON",
        "PERSON": "PERSON",
        "NORP": "NRP",
        "FAC": "FACILITY",
        "LOC": "LOCATION",
        "GPE": "LOCATION",
        "LOCATION": "LOCATION",
        "ORG": "ORGANIZATION",
        "ORGANIZATION": "ORGANIZATION",
        "DATE": "DATE_TIME",
        "TIME": "DATE_TIME",
    }
    english_model = {"lang_code": "en", "model_name": model_path}

    stanza_config = {
        "nlp_engine_name": "stanza",
        "models": [english_model],
        "ner_model_configuration": {
            "model_to_presidio_entity_mapping": label_to_entity,
        },
    }

    stanza_engine = NlpEngineProvider(
        nlp_configuration=stanza_config
    ).create_engine()

    recognizer_registry = RecognizerRegistry()
    recognizer_registry.load_predefined_recognizers(nlp_engine=stanza_engine)

    return stanza_engine, recognizer_registry
def create_nlp_engine_with_transformers(
    model_path: str,
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Instantiate an NlpEngine with a TransformersRecognizer and a small spaCy model.

    The TransformersRecognizer would return results from Transformers models, the spaCy model
    would return NlpArtifacts such as POS and lemmas.

    :param model_path: HuggingFace model path.
    :return: tuple of (the created NlpEngine, a RecognizerRegistry on which
        the predefined recognizers were loaded with that engine).
    """
    # Report through the module logger instead of printing to stdout, so the
    # message follows the application's logging configuration.
    logger.info(
        "Loading Transformers model: %s of type %s", model_path, type(model_path)
    )

    nlp_configuration = {
        "nlp_engine_name": "transformers",
        "models": [
            {
                "lang_code": "en",
                # The engine is configured with two models: a small spaCy
                # pipeline and the requested transformers model.
                "model_name": {"spacy": "en_core_web_sm", "transformers": model_path},
            }
        ],
        "ner_model_configuration": {
            # Keys are labels produced by the NER model, values are the
            # Presidio entity names they are reported as.
            "model_to_presidio_entity_mapping": {
                "PER": "PERSON",
                "PERSON": "PERSON",
                "LOC": "LOCATION",
                "LOCATION": "LOCATION",
                "GPE": "LOCATION",
                "ORG": "ORGANIZATION",
                "ORGANIZATION": "ORGANIZATION",
                "NORP": "NRP",
                "AGE": "AGE",
                "ID": "ID",
                "EMAIL": "EMAIL",
                "PATIENT": "PERSON",
                "STAFF": "PERSON",
                "HOSP": "ORGANIZATION",
                "PATORG": "ORGANIZATION",
                "DATE": "DATE_TIME",
                "TIME": "DATE_TIME",
                "PHONE": "PHONE_NUMBER",
                "HCW": "PERSON",
                "HOSPITAL": "ORGANIZATION",
                "FACILITY": "LOCATION",
            },
            # "ID" is flagged as a low-score entity with a 0.4 multiplier
            # (presumably scaling its confidence -- see the Presidio NER
            # model configuration docs).
            "low_confidence_score_multiplier": 0.4,
            "low_score_entity_names": ["ID"],
            # Model labels that are configured to be ignored.
            "labels_to_ignore": [
                "CARDINAL",
                "EVENT",
                "LANGUAGE",
                "LAW",
                "MONEY",
                "ORDINAL",
                "PERCENT",
                "PRODUCT",
                "QUANTITY",
                "WORK_OF_ART",
            ],
        },
    }

    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

    registry = RecognizerRegistry()
    registry.load_predefined_recognizers(nlp_engine=nlp_engine)

    return nlp_engine, registry
def create_nlp_engine_with_flair(
    model_path: str,
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Build a spaCy NlpEngine plus a registry that uses a FlairRecognizer.

    Entity results are expected to come from the Flair model, while the small
    spaCy model supplies NlpArtifacts such as POS and lemmas.

    :param model_path: Flair model path.
    :return: tuple of (the created spaCy NlpEngine, a RecognizerRegistry with
        the predefined recognizers, the FlairRecognizer added and the
        "SpacyRecognizer" removed).
    """
    # Imported lazily so the module can be loaded without this dependency.
    from flair_recognizer import FlairRecognizer

    spacy_model = "en_core_web_sm"

    recognizers = RecognizerRegistry()
    recognizers.load_predefined_recognizers()

    # there is no official Flair NlpEngine, hence we load it as an additional recognizer
    # Fetch the small spaCy model first if it is not installed as a package.
    if not spacy.util.is_package(spacy_model):
        spacy.cli.download(spacy_model)

    # Using a small spaCy model + a Flair NER model
    flair_ner = FlairRecognizer(model_path=model_path)
    recognizers.add_recognizer(flair_ner)
    recognizers.remove_recognizer("SpacyRecognizer")

    engine_settings = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": spacy_model}],
    }
    engine = NlpEngineProvider(nlp_configuration=engine_settings).create_engine()

    return engine, recognizers
def create_nlp_engine_with_azure_ai_language(
    ta_key: str, ta_endpoint: str
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Instantiate an NlpEngine with an AzureAIServiceWrapper and a small spaCy model.

    The AzureAIServiceWrapper would return results from calling Azure Text Analytics PII, the spaCy model
    would return NlpArtifacts such as POS and lemmas.

    :param ta_key: Azure Text Analytics key.
    :param ta_endpoint: Azure Text Analytics endpoint.
    :return: tuple of (the created spaCy NlpEngine, a RecognizerRegistry with
        the predefined recognizers, the Azure recognizer added and the
        "SpacyRecognizer" removed).
    :raises RuntimeError: if either ``ta_key`` or ``ta_endpoint`` is empty.
    """
    # Imported lazily so the module can be loaded without this dependency.
    from azure_ai_language_wrapper import AzureAIServiceWrapper

    if not ta_key or not ta_endpoint:
        raise RuntimeError("Please fill in the Text Analytics endpoint details")

    registry = RecognizerRegistry()
    registry.load_predefined_recognizers()

    azure_ai_language_recognizer = AzureAIServiceWrapper(
        ta_endpoint=ta_endpoint, ta_key=ta_key
    )

    nlp_configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
    }
    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

    # Swap the registry's "SpacyRecognizer" for the Azure-backed recognizer.
    registry.add_recognizer(azure_ai_language_recognizer)
    registry.remove_recognizer("SpacyRecognizer")

    return nlp_engine, registry