|
from typing import Tuple |
|
|
|
import spacy |
|
from presidio_analyzer import RecognizerRegistry |
|
from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider |
|
|
|
|
|
def create_nlp_engine_with_spacy(
    model_path: str,
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Build a spaCy-backed NlpEngine together with a recognizer registry.

    The registry is filled with the predefined recognizers. When
    ``spacy.util.is_package`` does not report ``model_path`` as an installed
    package, the model is fetched with ``spacy.cli.download`` first.

    :param model_path: spaCy model path.
    :return: Tuple of the created NlpEngine and the RecognizerRegistry.
    """
    recognizer_registry = RecognizerRegistry()
    recognizer_registry.load_predefined_recognizers()

    model_installed = spacy.util.is_package(model_path)
    if not model_installed:
        spacy.cli.download(model_path)

    # The requested model is registered under the "en" language code.
    english_model = {"lang_code": "en", "model_name": model_path}
    provider = NlpEngineProvider(
        nlp_configuration={
            "nlp_engine_name": "spacy",
            "models": [english_model],
        }
    )

    return provider.create_engine(), recognizer_registry
|
|
|
|
|
def create_nlp_engine_with_transformers(
    model_path: str,
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Build an NlpEngine paired with a TransformersRecognizer.

    Entity results come from the TransformersRecognizer, while a small spaCy
    model (``en_core_web_sm``) supplies NlpArtifacts such as POS and lemmas.
    The predefined ``SpacyRecognizer`` is removed from the registry after the
    TransformersRecognizer is added.

    :param model_path: HuggingFace model path.
    :return: Tuple of the created NlpEngine and the RecognizerRegistry.
    """
    from transformers_rec import (
        STANFORD_COFIGURATION,
        BERT_DEID_CONFIGURATION,
        TransformersRecognizer,
    )

    spacy_model = "en_core_web_sm"

    # Model paths that ship with a dedicated loading configuration.
    known_configurations = {
        "StanfordAIMI/stanford-deidentifier-base": STANFORD_COFIGURATION,
        "obi/deid_roberta_i2b2": BERT_DEID_CONFIGURATION,
    }

    recognizer_registry = RecognizerRegistry()
    recognizer_registry.load_predefined_recognizers()

    if not spacy.util.is_package(spacy_model):
        spacy.cli.download(spacy_model)

    recognizer = TransformersRecognizer(model_path=model_path)

    configuration = known_configurations.get(model_path)
    if configuration is None:
        # Unknown model: warn and fall back to the BERT de-id configuration.
        print("Warning: Model has no configuration, loading default.")
        configuration = BERT_DEID_CONFIGURATION
    recognizer.load_transformer(**configuration)

    recognizer_registry.add_recognizer(recognizer)
    recognizer_registry.remove_recognizer("SpacyRecognizer")

    provider = NlpEngineProvider(
        nlp_configuration={
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": spacy_model}],
        }
    )

    return provider.create_engine(), recognizer_registry
|
|
|
|
|
def create_nlp_engine_with_flair(
    model_path: str,
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Build an NlpEngine paired with a FlairRecognizer.

    Entity results come from the FlairRecognizer, while a small spaCy model
    (``en_core_web_sm``) supplies NlpArtifacts such as POS and lemmas. The
    predefined ``SpacyRecognizer`` is removed from the registry after the
    FlairRecognizer is added.

    :param model_path: Flair model path.
    :return: Tuple of the created NlpEngine and the RecognizerRegistry.
    """
    from flair_recognizer import FlairRecognizer

    spacy_model = "en_core_web_sm"

    recognizer_registry = RecognizerRegistry()
    recognizer_registry.load_predefined_recognizers()

    if not spacy.util.is_package(spacy_model):
        spacy.cli.download(spacy_model)

    recognizer = FlairRecognizer(model_path=model_path)
    recognizer_registry.add_recognizer(recognizer)
    recognizer_registry.remove_recognizer("SpacyRecognizer")

    provider = NlpEngineProvider(
        nlp_configuration={
            "nlp_engine_name": "spacy",
            "models": [{"lang_code": "en", "model_name": spacy_model}],
        }
    )

    return provider.create_engine(), recognizer_registry
|
|
|
|
|
def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
    """
    Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.

    The TextAnalyticsWrapper would return results from calling Azure Text
    Analytics PII, the spaCy model would return NlpArtifacts such as POS and
    lemmas. The predefined ``SpacyRecognizer`` is removed from the registry
    after the TextAnalyticsWrapper is added.

    :param ta_key: Azure Text Analytics key.
    :param ta_endpoint: Azure Text Analytics endpoint.
    :return: Tuple of the created NlpEngine and the RecognizerRegistry.
    :raises RuntimeError: If ``ta_key`` or ``ta_endpoint`` is empty.
    """
    # Validate the credentials first so that bad input fails fast with the
    # intended error, before the optional wrapper module is imported.
    if not ta_key or not ta_endpoint:
        raise RuntimeError("Please fill in the Text Analytics endpoint details")

    from text_analytics_wrapper import TextAnalyticsWrapper

    registry = RecognizerRegistry()
    registry.load_predefined_recognizers()

    # Same guard as the other engine factories in this file: make sure the
    # small spaCy model is available before the engine is created.
    if not spacy.util.is_package("en_core_web_sm"):
        spacy.cli.download("en_core_web_sm")

    ta_recognizer = TextAnalyticsWrapper(ta_endpoint=ta_endpoint, ta_key=ta_key)
    nlp_configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
    }

    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

    registry.add_recognizer(ta_recognizer)
    registry.remove_recognizer("SpacyRecognizer")

    return nlp_engine, registry
|
|