Added support for AWS Comprehend for PII identification. OCR and detection results now written to main output
f0f9378
import gradio as gr | |
from typing import List, Iterable, Dict, Union, Any, Optional, Iterator, Tuple | |
from tqdm import tqdm | |
from presidio_analyzer import DictAnalyzerResult, RecognizerResult #, AnalyzerEngine | |
from presidio_analyzer.nlp_engine import NlpArtifacts | |
def recognizer_result_from_dict(data: Dict) -> RecognizerResult:
    """
    Build a RecognizerResult from a single detected-entity dictionary.

    The dictionary is read through the keys "Type", "BeginOffset",
    "EndOffset" and "Score". A key that is absent yields None for the
    corresponding field; no validation is performed here.

    NOTE(review): these key names look like an AWS Comprehend PII entity
    record - confirm against the caller.

    :param data: e.g. {
        "Type": "NAME",
        "BeginOffset": 24,
        "EndOffset": 32,
        "Score": 0.8
    }
    :return: RecognizerResult with no analysis explanation and no
        recognition metadata attached.
    """
    return RecognizerResult(
        data.get("Type"),         # entity_type
        data.get("BeginOffset"),  # start
        data.get("EndOffset"),    # end
        data.get("Score"),        # score
        None,                     # analysis_explanation
        None,                     # recognition_metadata
    )
def analyze_iterator_custom(
    self,
    texts: Iterable[Union[str, bool, float, int]],
    language: str,
    list_length: int,
    progress=gr.Progress(),
    **kwargs,
) -> List[List[RecognizerResult]]:
    """
    Run the analyzer over every item of an iterable and collect the results.

    The items are first passed through `self._validate_types`, then sent to
    the NLP engine as a single batch; each (text, artifacts) pair coming back
    is analysed individually with `self.analyzer_engine.analyze`.

    `list_length` and `progress` are not used by the current implementation.
    They exist so that the loop can be wrapped in
    `progress.tqdm(..., total=list_length, desc="Analysing text for personal
    information", unit="rows")` when per-row progress within a file is wanted.

    :param texts: An iterable containing the values to be analyzed.
    :param language: Input language
    :param list_length: Length of the input list.
    :param progress: Gradio progress tracker.
    :param kwargs: Additional parameters for the `AnalyzerEngine.analyze` method.
    :return: One list of RecognizerResult per item yielded by the NLP batch,
        in the order the batch produced them.
    """
    engine = self.analyzer_engine
    checked_texts = self._validate_types(texts)

    # A single batched NLP pass is used for improved performance.
    artifacts_stream: Iterator[Tuple[str, NlpArtifacts]] = engine.nlp_engine.process_batch(
        texts=checked_texts, language=language
    )

    return [
        engine.analyze(
            text=str(raw_text),
            nlp_artifacts=artifacts,
            language=language,
            **kwargs,
        )
        for raw_text, artifacts in artifacts_stream
    ]
def analyze_dict(
    self,
    input_dict: Dict[str, Union[Any, Iterable[Any]]],
    language: str,
    keys_to_skip: Optional[List[str]] = None,
    **kwargs,
) -> Iterator[DictAnalyzerResult]:
    """
    Analyze a dictionary of keys (strings) and values/iterable of values.

    One DictAnalyzerResult is yielded per key, in the dictionary's own
    iteration order. Falsy values and keys listed in `keys_to_skip` are
    yielded as is with an empty list of recognizer results.

    :param input_dict: The input dictionary for analysis
    :param language: Input language
    :param keys_to_skip: Keys to ignore during analysis
    :param kwargs: Additional keyword arguments
        for the `AnalyzerEngine.analyze` method.
        Use this to pass arguments to the analyze method,
        such as `ad_hoc_recognizers`, `context`, `return_decision_process`.
        See `AnalyzerEngine.analyze` for the full list.
    :return: A generator of DictAnalyzerResult, one per key.
    :raises ValueError: If a value is neither a str/int/bool/float, a dict,
        nor an iterable.
    """
    # `context` is removed from kwargs because every branch below passes its
    # own `context=` argument; leaving it in would pass the keyword twice.
    # A missing or None context is treated as "no extra context".
    context = kwargs.pop("context", None) or []
    keys_to_skip = keys_to_skip or []

    for key, value in input_dict.items():
        if not value or key in keys_to_skip:
            # Nothing to analyse, or the caller asked to skip this key.
            yield DictAnalyzerResult(key=key, value=value, recognizer_results=[])
            continue

        # Add the key as an additional context. A new list is built each time
        # so the caller's context object is never mutated, and any iterable
        # (list, tuple, ...) is accepted.
        specific_context = [*context, key]

        # Exact type match: subclasses of these types fall through to the
        # branches below.
        if type(value) in (str, int, bool, float):
            # NOTE(review): only the key is passed as context here; the
            # caller-supplied context is not forwarded for scalar values -
            # confirm this is intended.
            results = self.analyzer_engine.analyze(
                text=str(value), language=language, context=[key], **kwargs
            )
        elif isinstance(value, dict):
            # Nested dictionary: delegate to the analyze_dict method of `self`
            # with the skip list narrowed for this key.
            new_keys_to_skip = self._get_nested_keys_to_skip(key, keys_to_skip)
            results = self.analyze_dict(
                input_dict=value,
                language=language,
                context=specific_context,
                keys_to_skip=new_keys_to_skip,
                **kwargs,
            )
        elif isinstance(value, Iterable):
            # len() is only defined for sized containers. Generators and other
            # unsized iterables are materialised first so their length can be
            # taken and so the yielded result still holds the analysed values
            # instead of an exhausted iterator.
            if not hasattr(value, "__len__"):
                value = list(value)
            results = analyze_iterator_custom(
                self,
                texts=value,
                language=language,
                context=specific_context,
                list_length=len(value),
                **kwargs,
            )
        else:
            raise ValueError(f"type {type(value)} is unsupported.")

        yield DictAnalyzerResult(key=key, value=value, recognizer_results=results)