import logging from typing import Optional, List, Tuple, Set from presidio_analyzer import ( RecognizerResult, EntityRecognizer, AnalysisExplanation, ) from presidio_analyzer.nlp_engine import NlpArtifacts logger = logging.getLogger("presidio-analyzer") try: from transformers import ( AutoTokenizer, AutoModelForTokenClassification, pipeline, models, ) from transformers.models.bert.modeling_bert import BertForTokenClassification except ImportError: logger.error("transformers is not installed") class TransformersRecognizer(EntityRecognizer): """ Wrapper for a transformers model, if needed to be used within Presidio Analyzer. :example: >from presidio_analyzer import AnalyzerEngine, RecognizerRegistry >transformers_recognizer = TransformersRecognizer() >registry = RecognizerRegistry() >registry.add_recognizer(transformers_recognizer) >analyzer = AnalyzerEngine(registry=registry) >results = analyzer.analyze( > "My name is Christopher and I live in Irbid.", > language="en", > return_decision_process=True, >) >for result in results: > print(result) > print(result.analysis_explanation) """ ENTITIES = [ "LOCATION", "PERSON", "ORGANIZATION", "AGE", "ID", "PHONE", "EMAIL", "DATE", ] DEFAULT_EXPLANATION = "Identified as {} by transformers's Named Entity Recognition" CHECK_LABEL_GROUPS = [ ({"LOCATION"}, {"LOC", "HOSP"}), ({"PERSON"}, {"PER", "PERSON", "STAFF","PATIENT"}), ({"ORGANIZATION"}, {"ORGANIZATION", "ORG", "PATORG"}), ({"AGE"}, {"AGE"}), ({"ID"}, {"ID"}), ({"EMAIL"}, {"EMAIL"}), ({"DATE"}, {"DATE"}), ] PRESIDIO_EQUIVALENCES = { "PER": "PERSON", "LOC": "LOCATION", "ORG": "ORGANIZATION", "AGE": "AGE", "ID": "ID", "EMAIL": "EMAIL" } DEFAULT_MODEL_PATH = "obi/deid_roberta_i2b2" def __init__( self, supported_entities: Optional[List[str]] = None, check_label_groups: Optional[Tuple[Set, Set]] = None, model: Optional[BertForTokenClassification] = None, model_path: Optional[str] = None, ): if not model and not model_path: model_path = self.DEFAULT_MODEL_PATH logger.warning( f"Both 'model' and 'model_path' arguments are None. Using default model_path={model_path}" ) if model and model_path: logger.warning( f"Both 'model' and 'model_path' arguments were provided. Ignoring the model_path" ) self.check_label_groups = ( check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS ) supported_entities = supported_entities if supported_entities else self.ENTITIES self.model = ( model if model else pipeline( "ner", model=AutoModelForTokenClassification.from_pretrained(model_path), tokenizer=AutoTokenizer.from_pretrained(model_path), aggregation_strategy="simple", ) ) super().__init__( supported_entities=supported_entities, name="transformers Analytics", ) def load(self) -> None: """Load the model, not used. Model is loaded during initialization.""" pass def get_supported_entities(self) -> List[str]: """ Return supported entities by this model. :return: List of the supported entities. """ return self.supported_entities # Class to use transformers with Presidio as an external recognizer. def analyze( self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None ) -> List[RecognizerResult]: """ Analyze text using Text Analytics. :param text: The text for analysis. :param entities: Not working properly for this recognizer. :param nlp_artifacts: Not used by this recognizer. :return: The list of Presidio RecognizerResult constructed from the recognized transformers detections. """ results = [] ner_results = self.model(text) # If there are no specific list of entities, we will look for all of it. if not entities: entities = self.supported_entities for entity in entities: if entity not in self.supported_entities: continue for res in ner_results: if not self.__check_label( entity, res["entity_group"], self.check_label_groups ): continue textual_explanation = self.DEFAULT_EXPLANATION.format( res["entity_group"] ) explanation = self.build_transformers_explanation( round(res["score"], 2), textual_explanation ) transformers_result = self._convert_to_recognizer_result( res, explanation ) results.append(transformers_result) return results def _convert_to_recognizer_result(self, res, explanation) -> RecognizerResult: entity_type = self.PRESIDIO_EQUIVALENCES.get( res["entity_group"], res["entity_group"] ) transformers_score = round(res["score"], 2) transformers_results = RecognizerResult( entity_type=entity_type, start=res["start"], end=res["end"], score=transformers_score, analysis_explanation=explanation, ) return transformers_results def build_transformers_explanation( self, original_score: float, explanation: str ) -> AnalysisExplanation: """ Create explanation for why this result was detected. :param original_score: Score given by this recognizer :param explanation: Explanation string :return: """ explanation = AnalysisExplanation( recognizer=self.__class__.__name__, original_score=original_score, textual_explanation=explanation, ) return explanation @staticmethod def __check_label( entity: str, label: str, check_label_groups: Tuple[Set, Set] ) -> bool: return any( [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups] ) if __name__ == "__main__": from presidio_analyzer import AnalyzerEngine, RecognizerRegistry transformers_recognizer = ( TransformersRecognizer() ) # This would download a large (~500Mb) model on the first run registry = RecognizerRegistry() registry.add_recognizer(transformers_recognizer) analyzer = AnalyzerEngine(registry=registry) results = analyzer.analyze( "My name is Christopher and I live in Irbid.", language="en", return_decision_process=True, ) for result in results: print(result) print(result.analysis_explanation)