Spaces:

beki
/

pii-anonymizer

Build error

App Files Files Community

beki commited on Sep 22, 2022

Commit

62db53d

•

1 Parent(s): fec18f3

Create new file

Browse files

Files changed (1) hide show

flair_recognizer.py +233 -0

flair_recognizer.py ADDED Viewed

	@@ -0,0 +1,233 @@

+import logging
+from typing import Optional, List, Tuple, Set
+from presidio_analyzer import (
+    RecognizerResult,
+    EntityRecognizer,
+    AnalysisExplanation,
+)
+from presidio_analyzer.nlp_engine import NlpArtifacts
+try:
+    from flair.data import Sentence
+    from flair.models import SequenceTagger
+except ImportError:
+    print("Flair is not installed")
+logger = logging.getLogger("presidio-analyzer")
+class FlairRecognizer(EntityRecognizer):
+    """
+    Wrapper for a flair model, if needed to be used within Presidio Analyzer.
+    :example:
+    >from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
+    >flair_recognizer = FlairRecognizer()
+    >registry = RecognizerRegistry()
+    >registry.add_recognizer(flair_recognizer)
+    >analyzer = AnalyzerEngine(registry=registry)
+    >results = analyzer.analyze(
+    >    "My name is Christopher and I live in Irbid.",
+    >    language="en",
+    >    return_decision_process=True,
+    >)
+    >for result in results:
+    >    print(result)
+    >    print(result.analysis_explanation)
+    """
+    ENTITIES = [
+        "LOCATION",
+        "PERSON",
+        "NRP",
+        "GPE",
+        "ORGANIZATION",
+        "MAC_ADDRESS",
+        "US_BANK_NUMBER",
+        "IMEI",
+        "TITLE",
+        "LICENSE_PLATE",
+        "US_PASSPORT",
+        "CURRENCY",
+        "ROUTING_NUMBER",
+        "US_ITIN",
+        "US_BANK_NUMBER",
+        "US_DRIVER_LICENSE",
+    ]
+    DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"
+    CHECK_LABEL_GROUPS = [
+        ({"LOCATION"}, {"LOC", "LOCATION", "STREET_ADDRESS", "COORDINATE"}),
+        ({"PERSON"}, {"PER", "PERSON"}),
+        ({"NRP"}, {"NORP", "NRP"}),
+        ({"GPE"}, {"GPE"}),
+        ({"ORGANIZATION"}, {"ORG"}),
+        ({"MAC_ADDRESS"}, {"MAC_ADDRESS"}),
+        ({"US_BANK_NUMBER"}, {"US_BANK_NUMBER"}),
+        ({"IMEI"}, {"IMEI"}),
+        ({"TITLE"}, {"TITLE"}),
+        ({"LICENSE_PLATE"}, {"LICENSE_PLATE"}),
+        ({"US_PASSPORT"}, {"US_PASSPORT"}),
+        ({"CURRENCY"}, {"CURRENCY"}),
+        ({"ROUTING_NUMBER"}, {"ROUTING_NUMBER"}),
+        # ({"US_ITIN"}, {"US_ITIN"}),
+        # ({"US_BANK_NUMBER"}, {"US_BANK_NUMBER"}),
+        # ({"US_DRIVER_LICENSE"}, {"US_DRIVER_LICENSE"}),
+    ]
+    MODEL_LANGUAGES = {
+        "en": "beki/flair-ner-debug-english",
+        # "es": "flair/ner-spanish-large",
+        # "de": "flair/ner-german-large",
+        # "nl": "flair/ner-dutch-large",
+    }
+    PRESIDIO_EQUIVALENCES = {
+        "PER": "PERSON",
+        "LOC": "LOCATION",
+        "ORG": "ORGANIZATION",
+        # 'MISC': 'MISCELLANEOUS'   # - Probably not PII
+    }
+    def __init__(
+        self,
+        supported_language: str = "en",
+        supported_entities: Optional[List[str]] = None,
+        check_label_groups: Optional[Tuple[Set, Set]] = None,
+        model: SequenceTagger = None,
+    ):
+        self.check_label_groups = (
+            check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
+        )
+        supported_entities = supported_entities if supported_entities else self.ENTITIES
+        self.model = (
+            model
+            if model
+            else SequenceTagger.load(self.MODEL_LANGUAGES.get(supported_language))
+        )
+        super().__init__(
+            supported_entities=supported_entities,
+            supported_language=supported_language,
+            name="Flair Analytics",
+        )
+    def load(self) -> None:
+        """Load the model, not used. Model is loaded during initialization."""
+        pass
+    def get_supported_entities(self) -> List[str]:
+        """
+        Return supported entities by this model.
+        :return: List of the supported entities.
+        """
+        return self.supported_entities
+    # Class to use Flair with Presidio as an external recognizer.
+    def analyze(
+        self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None
+    ) -> List[RecognizerResult]:
+        """
+        Analyze text using Text Analytics.
+        :param text: The text for analysis.
+        :param entities: Not working properly for this recognizer.
+        :param nlp_artifacts: Not used by this recognizer.
+        :param language: Text language. Supported languages in MODEL_LANGUAGES
+        :return: The list of Presidio RecognizerResult constructed from the recognized
+            Flair detections.
+        """
+        results = []
+        sentences = Sentence(text)
+        self.model.predict(sentences)
+        # If there are no specific list of entities, we will look for all of it.
+        if not entities:
+            entities = self.supported_entities
+        for entity in entities:
+            if entity not in self.supported_entities:
+                continue
+            for ent in sentences.get_spans("ner"):
+                if not self.__check_label(
+                    entity, ent.labels[0].value, self.check_label_groups
+                ):
+                    continue
+                textual_explanation = self.DEFAULT_EXPLANATION.format(
+                    ent.labels[0].value
+                )
+                explanation = self.build_flair_explanation(
+                    round(ent.score, 2), textual_explanation
+                )
+                flair_result = self._convert_to_recognizer_result(ent, explanation)
+                results.append(flair_result)
+        return results
+    def _convert_to_recognizer_result(self, entity, explanation) -> RecognizerResult:
+        entity_type = self.PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)
+        flair_score = round(entity.score, 2)
+        flair_results = RecognizerResult(
+            entity_type=entity_type,
+            start=entity.start_position,
+            end=entity.end_position,
+            score=flair_score,
+            analysis_explanation=explanation,
+        )
+        return flair_results
+    def build_flair_explanation(
+        self, original_score: float, explanation: str
+    ) -> AnalysisExplanation:
+        """
+        Create explanation for why this result was detected.
+        :param original_score: Score given by this recognizer
+        :param explanation: Explanation string
+        :return:
+        """
+        explanation = AnalysisExplanation(
+            recognizer=self.__class__.__name__,
+            original_score=original_score,
+            textual_explanation=explanation,
+        )
+        return explanation
+    @staticmethod
+    def __check_label(
+        entity: str, label: str, check_label_groups: Tuple[Set, Set]
+    ) -> bool:
+        return any(
+            [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
+        )
+if __name__ == "__main__":
+    from presidio_analyzer import AnalyzerEngine, RecognizerRegistry
+    flair_recognizer = (
+        FlairRecognizer()
+    )  # This would download a very large (+2GB) model on the first run
+    registry = RecognizerRegistry()
+    registry.add_recognizer(flair_recognizer)
+    analyzer = AnalyzerEngine(registry=registry)
+    results = analyzer.analyze(
+        "{first_name: Moustafa, sale_id: 235234}",
+        language="en",
+        return_decision_process=True,
+    )
+    for result in results:
+        print(result)
+        print(result.analysis_explanation)