import gradio as gr
from typing import List, Iterable, Dict, Union, Any, Optional, Iterator, Tuple
from tqdm import tqdm
from presidio_analyzer import DictAnalyzerResult, RecognizerResult #, AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpArtifacts


def recognizer_result_from_dict(data: Dict) -> RecognizerResult:
    """
    Create a RecognizerResult from a dictionary.

    :param data: e.g. {
        "Type": "NAME",
        "BeginOffset": 24,
        "EndOffset": 32,
        "Score": 0.8
    }
    :return: RecognizerResult
    """
    entity_type = data.get("Type")
    start = data.get("BeginOffset")
    end = data.get("EndOffset")
    score = data.get("Score")
    analysis_explanation = None
    recognition_metadata = None

    return RecognizerResult(entity_type, start, end, score, analysis_explanation, recognition_metadata)
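
# A minimal, hedged usage sketch (not part of the original logic): the keys read
# above ("Type", "BeginOffset", "EndOffset", "Score") match the shape of an entity
# returned by AWS Comprehend's detect_pii_entities, which appears to be the
# intended input; any dict with those keys converts the same way:
#
#   result = recognizer_result_from_dict(
#       {"Type": "NAME", "BeginOffset": 24, "EndOffset": 32, "Score": 0.8}
#   )
#   print(result.entity_type, result.start, result.end, result.score)
#   # -> NAME 24 32 0.8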


def analyze_iterator_custom(
    self,
    texts: Iterable[Union[str, bool, float, int]],
    language: str,
    list_length: int,
    progress=gr.Progress(),
    **kwargs,
) -> List[List[RecognizerResult]]:
    """
    Analyze an iterable of strings.

    :param texts: A list containing strings to be analyzed.
    :param language: Input language
    :param list_length: Length of the input list.
    :param progress: Gradio progress tracker.
    :param kwargs: Additional parameters for the `AnalyzerEngine.analyze` method.
    """

    # Validate the types of the input values
    texts = self._validate_types(texts)

    # Process the texts as a batch for improved performance
    nlp_artifacts_batch: Iterator[
        Tuple[str, NlpArtifacts]
    ] = self.analyzer_engine.nlp_engine.process_batch(
        texts=texts, language=language
    )

    list_results = []

    # Uncomment this if you want to show progress within a file
    # for text, nlp_artifacts in progress.tqdm(nlp_artifacts_batch, total=list_length, desc="Analysing text for personal information", unit="rows"):
    for text, nlp_artifacts in nlp_artifacts_batch:
        results = self.analyzer_engine.analyze(
            text=str(text), nlp_artifacts=nlp_artifacts, language=language, **kwargs
        )
        list_results.append(results)

    return list_results
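
# A hedged usage sketch: these free functions mirror methods of presidio's
# BatchAnalyzerEngine but take the engine instance explicitly as `self`, so they
# can be called with any object exposing `analyzer_engine` and `_validate_types`
# (the example engine and texts below are illustrative, not from the original):
#
#   from presidio_analyzer import BatchAnalyzerEngine
#
#   batch_analyzer = BatchAnalyzerEngine()
#   results = analyze_iterator_custom(
#       batch_analyzer,
#       texts=["My name is David", "No PII here"],
#       language="en",
#       list_length=2,
#   )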


def analyze_dict(
    self,
    input_dict: Dict[str, Union[Any, Iterable[Any]]],
    language: str,
    keys_to_skip: Optional[List[str]] = None,
    **kwargs,
) -> Iterator[DictAnalyzerResult]:
    """
    Analyze a dictionary of keys (strings) and values/iterables of values.
    Non-string values are returned as is.

    :param input_dict: The input dictionary for analysis
    :param language: Input language
    :param keys_to_skip: Keys to ignore during analysis
    :param kwargs: Additional keyword arguments
        for the `AnalyzerEngine.analyze` method.
        Use this to pass arguments to the analyze method,
        such as `ad_hoc_recognizers`, `context`, `return_decision_process`.
        See `AnalyzerEngine.analyze` for the full list.
    """

    context = []
    if "context" in kwargs:
        context = kwargs["context"]
        del kwargs["context"]

    if not keys_to_skip:
        keys_to_skip = []

    for key, value in input_dict.items():
        if not value or key in keys_to_skip:
            yield DictAnalyzerResult(key=key, value=value, recognizer_results=[])
            continue  # skip this key as requested

        # Add the key as an additional context
        specific_context = context[:]
        specific_context.append(key)

        if type(value) in (str, int, bool, float):
            results: List[RecognizerResult] = self.analyzer_engine.analyze(
                text=str(value), language=language, context=[key], **kwargs
            )
        elif isinstance(value, dict):
            # Recursively analyze nested dicts through this custom function,
            # so nested iterables also take the batch path below
            new_keys_to_skip = self._get_nested_keys_to_skip(key, keys_to_skip)
            results = analyze_dict(
                self,
                input_dict=value,
                language=language,
                context=specific_context,
                keys_to_skip=new_keys_to_skip,
                **kwargs,
            )
        elif isinstance(value, Iterable):
            # Analyze the iterable of values as a batch. len() assumes a sized
            # iterable (e.g. a list); the length is only used as the progress total
            list_length = len(value)
            results: List[List[RecognizerResult]] = analyze_iterator_custom(
                self,
                texts=value,
                language=language,
                context=specific_context,
                list_length=list_length,
                **kwargs,
            )
        else:
            raise ValueError(f"type {type(value)} is unsupported.")

        yield DictAnalyzerResult(key=key, value=value, recognizer_results=results)
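

if __name__ == "__main__":
    # A minimal, hedged smoke test, not part of the original module. It assumes
    # presidio-analyzer is installed along with a spaCy model that its default
    # NLP engine can load (e.g. en_core_web_lg); the sample data is illustrative.
    from presidio_analyzer import BatchAnalyzerEngine

    batch_analyzer = BatchAnalyzerEngine()
    sample = {
        "notes": ["My name is David Johnson", "Nothing sensitive here"],
        "id": 12345,
    }

    # analyze_dict is a generator; iterate it to run the analysis per key
    for result in analyze_dict(batch_analyzer, input_dict=sample, language="en"):
        print(result.key, result.recognizer_results)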