Spaces:
Running
Running
import os | |
from typing import List, Optional | |
import logging | |
import dotenv | |
from azure.ai.textanalytics import TextAnalyticsClient | |
from azure.core.credentials import AzureKeyCredential | |
from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalysisExplanation | |
from presidio_analyzer.nlp_engine import NlpArtifacts | |
logger = logging.getLogger("presidio-streamlit") | |
class TextAnalyticsWrapper(EntityRecognizer):
    """Presidio recognizer that delegates PII detection to Azure Text Analytics.

    Each call to :meth:`analyze` sends the text to the service through
    ``TextAnalyticsClient.recognize_pii_entities`` and converts the returned
    entities into ``RecognizerResult`` objects.
    """

    from azure.ai.textanalytics._models import PiiEntityCategory

    # Every category value exposed by the SDK's PiiEntityCategory enum; used
    # as the default list of supported entities.
    TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]

    def __init__(
        self,
        supported_entities: Optional[List[str]] = None,
        supported_language: str = "en",
        ta_client: Optional[TextAnalyticsClient] = None,
        ta_key: Optional[str] = None,
        ta_endpoint: Optional[str] = None,
    ):
        """
        Wrapper for the Azure Text Analytics client
        :param supported_entities: entity categories to report; falls back to
            TA_SUPPORTED_ENTITIES when empty or None
        :param supported_language: language code passed to the service
        :param ta_client: object of type TextAnalyticsClient; when omitted, a
            client is created from ta_key and ta_endpoint
        :param ta_key: Azure cognitive Services for Language key
        :param ta_endpoint: Azure cognitive Services for Language endpoint
        """
        if not supported_entities:
            supported_entities = self.TA_SUPPORTED_ENTITIES

        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            name="Azure Text Analytics PII",
        )

        self.ta_key = ta_key
        self.ta_endpoint = ta_endpoint

        if not ta_client:
            ta_client = self.__authenticate_client(ta_key, ta_endpoint)
        self.ta_client = ta_client

    @staticmethod
    def __authenticate_client(key: str, endpoint: str) -> TextAnalyticsClient:
        """Create a TextAnalyticsClient for the given key and endpoint.

        Declared static because it takes no ``self``; without the decorator
        the ``self.__authenticate_client(key, endpoint)`` call in ``__init__``
        would pass the instance as an extra positional argument.

        :param key: Azure cognitive Services for Language key
        :param endpoint: Azure cognitive Services for Language endpoint
        :return: a client authenticated with an AzureKeyCredential
        """
        ta_credential = AzureKeyCredential(key)
        text_analytics_client = TextAnalyticsClient(
            endpoint=endpoint, credential=ta_credential
        )
        return text_analytics_client

    def analyze(
        self,
        text: str,
        entities: Optional[List[str]] = None,
        nlp_artifacts: Optional[NlpArtifacts] = None,
    ) -> List[RecognizerResult]:
        """Detect PII in ``text`` using the Azure Text Analytics service.

        :param text: the text to analyze
        :param entities: accepted for interface compatibility; this method
            does not use it to narrow results, only ``self.supported_entities``
            is applied
        :param nlp_artifacts: accepted for interface compatibility; unused
        :return: one RecognizerResult per returned entity whose category is in
            ``self.supported_entities``
        """
        response = self.ta_client.recognize_pii_entities(
            [text], language=self.supported_language
        )

        recognizer_results = []
        for doc in response:
            if doc.is_error:
                # Keep the best-effort behaviour (skip the document) but leave
                # a trace instead of dropping the failure silently.
                logger.warning(
                    "Text Analytics returned an error for a document: %s",
                    getattr(doc, "error", doc),
                )
                continue

            for entity in doc.entities:
                if entity.category not in self.supported_entities:
                    continue
                analysis_explanation = TextAnalyticsWrapper._build_explanation(
                    original_score=entity.confidence_score,
                    entity_type=entity.category,
                )
                recognizer_results.append(
                    RecognizerResult(
                        entity_type=entity.category,
                        start=entity.offset,
                        # End offset is derived from the matched text length.
                        end=entity.offset + len(entity.text),
                        score=entity.confidence_score,
                        analysis_explanation=analysis_explanation,
                    )
                )

        return recognizer_results

    @staticmethod
    def _build_explanation(
        original_score: float, entity_type: str
    ) -> AnalysisExplanation:
        """Build the AnalysisExplanation attached to a single result.

        :param original_score: confidence score reported for the entity
        :param entity_type: category reported for the entity
        :return: explanation naming this recognizer and the entity type
        """
        explanation = AnalysisExplanation(
            # ``TextAnalyticsWrapper.__class__`` is the metaclass, so its
            # ``__name__`` is not the recognizer's name; use the class's own.
            recognizer=TextAnalyticsWrapper.__name__,
            original_score=original_score,
            textual_explanation=f"Identified as {entity_type} by Text Analytics",
        )
        return explanation

    def load(self) -> None:
        """No-op: this recognizer has nothing to load."""
        pass
if __name__ == "__main__":
    # Manual smoke run: build an analyzer through presidio_helpers and push a
    # sample paragraph through it. Requires TA_KEY and TA_ENDPOINT to be set
    # in the environment (dotenv.load_dotenv() is called first).
    import presidio_helpers

    dotenv.load_dotenv()

    sample_text = """
Here are a few example sentences we currently support:
Hello, my name is David Johnson and I live in Maine.
My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
On September 18 I visited microsoft.com and sent an email to test@presidio.site, from the IP 192.168.0.1.
My passport: 191280342 and my phone number: (212) 555-1234.
This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?
Kate's social security number is 078-05-1126. Her driver license? it is 1234567A.
"""

    # Resolve the factory first, then gather its keyword arguments, so the
    # lookups happen in the same order as a direct call would perform them.
    build_engine = presidio_helpers.analyzer_engine
    engine_settings = {
        "model_path": "Azure Text Analytics PII",
        "ta_key": os.environ["TA_KEY"],
        "ta_endpoint": os.environ["TA_ENDPOINT"],
    }
    engine = build_engine(**engine_settings)

    # The result is intentionally not used; the call only exercises the path.
    engine.analyze(text=sample_text, language="en")