presidio committed on
Commit
547518c
·
1 Parent(s): 6f1792a

Upload 8 files

Browse files
Dockerfile CHANGED
@@ -13,6 +13,7 @@ COPY ./requirements.txt /code/requirements.txt
13
  RUN pip3 install -r requirements.txt
14
  RUN pip3 install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
15
  RUN pip3 install https://huggingface.co/spacy/en_core_web_lg/resolve/main/en_core_web_lg-any-py3-none-any.whl
 
16
  EXPOSE 7860
17
 
18
  COPY . /code
 
13
  RUN pip3 install -r requirements.txt
14
  RUN pip3 install https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
15
  RUN pip3 install https://huggingface.co/spacy/en_core_web_lg/resolve/main/en_core_web_lg-any-py3-none-any.whl
16
+
17
  EXPOSE 7860
18
 
19
  COPY . /code
demo_text.txt CHANGED
@@ -1,4 +1,4 @@
1
- Here are a few examples sentences we currently support:
2
 
3
  Hello, my name is David Johnson and I live in Maine.
4
  My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
 
1
+ Here are a few example sentences we currently support:
2
 
3
  Hello, my name is David Johnson and I live in Maine.
4
  My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.
flair_recognizer.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Optional, List, Tuple, Set
3
+
4
+ from presidio_analyzer import (
5
+ RecognizerResult,
6
+ EntityRecognizer,
7
+ AnalysisExplanation,
8
+ )
9
+ from presidio_analyzer.nlp_engine import NlpArtifacts
10
+
11
+ from flair.data import Sentence
12
+ from flair.models import SequenceTagger
13
+
14
+
15
+ logger = logging.getLogger("presidio-analyzer")
16
+
17
+
18
class FlairRecognizer(EntityRecognizer):
    """
    Wrapper for a flair model, if needed to be used within Presidio Analyzer.

    The recognizer runs a Flair ``SequenceTagger`` over the input text and
    converts every NER span whose label maps to a requested Presidio entity
    into a ``RecognizerResult``.

    :example:
    >from presidio_analyzer import AnalyzerEngine, RecognizerRegistry

    >flair_recognizer = FlairRecognizer()

    >registry = RecognizerRegistry()
    >registry.add_recognizer(flair_recognizer)

    >analyzer = AnalyzerEngine(registry=registry)

    >results = analyzer.analyze(
    >    "My name is Christopher and I live in Irbid.",
    >    language="en",
    >    return_decision_process=True,
    >)
    >for result in results:
    >    print(result)
    >    print(result.analysis_explanation)
    """

    # Presidio entity names this recognizer reports by default.
    ENTITIES = [
        "LOCATION",
        "PERSON",
        "ORGANIZATION",
        # "MISCELLANEOUS"  # - There are no direct correlation with Presidio entities.
    ]

    DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"

    # Pairs of (Presidio entity names, model labels accepted for those entities).
    CHECK_LABEL_GROUPS = [
        ({"LOCATION"}, {"LOC", "LOCATION"}),
        ({"PERSON"}, {"PER", "PERSON"}),
        ({"ORGANIZATION"}, {"ORG"}),
        # ({"MISCELLANEOUS"}, {"MISC"}),  # Probably not PII
    ]

    # Model identifier loaded for each supported language code.
    MODEL_LANGUAGES = {
        "en": "flair/ner-english-large"
    }

    # Translation from the model's short tags to Presidio entity names.
    PRESIDIO_EQUIVALENCES = {
        "PER": "PERSON",
        "LOC": "LOCATION",
        "ORG": "ORGANIZATION",
        # 'MISC': 'MISCELLANEOUS'  # - Probably not PII
    }

    def __init__(
        self,
        supported_language: str = "en",
        supported_entities: Optional[List[str]] = None,
        check_label_groups: Optional[List[Tuple[Set, Set]]] = None,
        model: Optional[SequenceTagger] = None,
    ):
        """
        Create the recognizer and make sure a tagger model is available.

        :param supported_language: Language code; used to pick the model from
            MODEL_LANGUAGES when no ``model`` is passed.
        :param supported_entities: Presidio entities to report. Defaults to ENTITIES.
        :param check_label_groups: Pairs of (entity names, model labels) used to
            decide whether a model label belongs to a requested entity.
            Defaults to CHECK_LABEL_GROUPS.
        :param model: An already loaded tagger. When omitted, the model registered
            for ``supported_language`` is loaded.
        :raises ValueError: If no model is passed and ``supported_language`` has
            no entry in MODEL_LANGUAGES.
        """
        self.check_label_groups = (
            check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
        )
        supported_entities = supported_entities if supported_entities else self.ENTITIES

        if model is None:
            model_name = self.MODEL_LANGUAGES.get(supported_language)
            if model_name is None:
                # Fail early with a readable message instead of handing None
                # to SequenceTagger.load.
                raise ValueError(
                    f"No Flair model is configured for language "
                    f"'{supported_language}'. Supported languages: "
                    f"{sorted(self.MODEL_LANGUAGES)}"
                )
            model = SequenceTagger.load(model_name)
        self.model = model

        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            name="Flair Analytics",
        )

    def load(self) -> None:
        """Load the model, not used. Model is loaded during initialization."""
        pass

    def get_supported_entities(self) -> List[str]:
        """
        Return supported entities by this model.

        :return: List of the supported entities.
        """
        return self.supported_entities

    def analyze(
        self,
        text: str,
        entities: List[str],
        nlp_artifacts: Optional[NlpArtifacts] = None,
    ) -> List[RecognizerResult]:
        """
        Analyze text using the Flair NER model.

        :param text: The text for analysis.
        :param entities: Entities to look for. When empty, all supported
            entities are used; entities this recognizer does not support
            are skipped.
        :param nlp_artifacts: Not used by this recognizer.
        :return: The list of Presidio RecognizerResult constructed from the recognized
            Flair detections.
        """
        results = []

        sentence = Sentence(text)
        self.model.predict(sentence)

        # If there are no specific list of entities, we will look for all of it.
        if not entities:
            entities = self.supported_entities

        # The predicted spans do not depend on the requested entity,
        # so fetch them once instead of once per entity.
        spans = sentence.get_spans("ner")

        for entity in entities:
            if entity not in self.supported_entities:
                continue

            for span in spans:
                label = span.labels[0].value
                if not self.__check_label(entity, label, self.check_label_groups):
                    continue

                explanation = self.build_flair_explanation(
                    round(span.score, 2),
                    self.DEFAULT_EXPLANATION.format(label),
                )
                results.append(self._convert_to_recognizer_result(span, explanation))

        return results

    def _convert_to_recognizer_result(self, entity, explanation) -> RecognizerResult:
        """
        Build a RecognizerResult from a predicted span.

        :param entity: The predicted span (its tag, score and start/end positions are read).
        :param explanation: The AnalysisExplanation to attach to the result.
        :return: The Presidio result; the tag is translated through
            PRESIDIO_EQUIVALENCES and kept as-is when it has no entry there.
        """
        entity_type = self.PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)
        flair_score = round(entity.score, 2)

        return RecognizerResult(
            entity_type=entity_type,
            start=entity.start_position,
            end=entity.end_position,
            score=flair_score,
            analysis_explanation=explanation,
        )

    def build_flair_explanation(
        self, original_score: float, explanation: str
    ) -> AnalysisExplanation:
        """
        Create explanation for why this result was detected.

        :param original_score: Score given by this recognizer
        :param explanation: Explanation string
        :return: The AnalysisExplanation carrying the recognizer class name,
            the score and the textual explanation.
        """
        return AnalysisExplanation(
            recognizer=self.__class__.__name__,
            original_score=original_score,
            textual_explanation=explanation,
        )

    @staticmethod
    def __check_label(
        entity: str, label: str, check_label_groups: List[Tuple[Set, Set]]
    ) -> bool:
        """
        Tell whether a model label belongs to the requested entity.

        :param entity: Presidio entity name being looked for.
        :param label: Label predicted by the model.
        :param check_label_groups: Pairs of (entity names, model labels).
        :return: True when some pair contains both the entity and the label.
        """
        return any(
            entity in egrp and label in lgrp for egrp, lgrp in check_label_groups
        )
flair_test.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import generic wrappers
2
+ from transformers import AutoModel, AutoTokenizer
3
+
4
+
5
def main() -> None:
    """Tag one example sentence with the Flair NER model and print what was found."""
    # Imported here so that merely importing this module does not require flair.
    from flair.data import Sentence
    from flair.models import SequenceTagger

    ner_tagger = SequenceTagger.load("flair/ner-english-large")

    example = Sentence("George Washington went to Washington")
    ner_tagger.predict(example)

    # The sentence itself, followed by every predicted NER span.
    print(example)
    print('The following NER tags are found:')
    for span in example.get_spans('ner'):
        print(span)


if __name__ == "__main__":
    main()
27
+
openai_fake_data_generator.py CHANGED
@@ -1,37 +1,33 @@
1
  import openai
2
- frmo typing import List
3
- from presidio_analyzer import RecognizerResult
4
- from presidio_anonymizer import AnonymizerEngine
5
 
6
-
7
- def set_openai_key(openai_key:string):
8
  """Set the OpenAI API key.
9
  :param openai_key: the open AI key (https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key)
10
  """
11
  openai.api_key = openai_key
12
 
13
 
14
- def call_completion_model(prompt:str, model:str="text-davinci-003", max_tokens:int=512) ->str:
 
 
15
  """Creates a request for the OpenAI Completion service and returns the response.
16
-
17
  :param prompt: The prompt for the completion model
18
  :param model: OpenAI model name
19
- :param temperature: Model's temperature parameter
20
  """
21
 
22
  response = openai.Completion.create(
23
- model=model,
24
- prompt= prompt,
25
- max_tokens=max_tokens
26
  )
27
 
28
- return response['choices'][0].text
29
 
30
 
31
  def create_prompt(anonymized_text: str) -> str:
32
  """
33
  Create the prompt with instructions to GPT-3.
34
-
35
  :param anonymized_text: Text with placeholders instead of PII values, e.g. My name is <PERSON>.
36
  """
37
 
 
1
  import openai
 
 
 
2
 
3
def set_openai_key(openai_key: str) -> None:
    """Store the given key as the API key of the ``openai`` module.

    :param openai_key: the open AI key (https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key)
    """
    openai.api_key = openai_key
8
 
9
 
10
def call_completion_model(
    prompt: str, model: str = "text-davinci-003", max_tokens: int = 512
) -> str:
    """Send a request to the OpenAI Completion service and return the generated text.

    :param prompt: The prompt for the completion model
    :param model: OpenAI model name
    :param max_tokens: Value passed as the request's max_tokens parameter
    :return: The ``text`` of the first choice in the response
    """
    request_args = {"model": model, "prompt": prompt, "max_tokens": max_tokens}
    completion = openai.Completion.create(**request_args)

    first_choice = completion["choices"][0]
    return first_choice.text
25
 
26
 
27
  def create_prompt(anonymized_text: str) -> str:
28
  """
29
  Create the prompt with instructions to GPT-3.
30
+
31
  :param anonymized_text: Text with placeholders instead of PII values, e.g. My name is <PERSON>.
32
  """
33
 
presidio_streamlit.py CHANGED
@@ -1,5 +1,5 @@
1
  """Streamlit app for Presidio."""
2
-
3
  from json import JSONEncoder
4
  from typing import List
5
 
@@ -12,13 +12,18 @@ from presidio_analyzer.nlp_engine import NlpEngineProvider
12
  from presidio_anonymizer import AnonymizerEngine
13
  from presidio_anonymizer.entities import OperatorConfig
14
 
 
15
  from transformers_rec import (
16
  STANFORD_COFIGURATION,
17
  TransformersRecognizer,
18
  BERT_DEID_CONFIGURATION,
19
  )
20
 
21
- from openai_fake_data_generator import *
 
 
 
 
22
 
23
 
24
  # Helper methods
@@ -37,15 +42,26 @@ def analyzer_engine(model_path: str):
37
 
38
  # Set up NLP Engine according to the model of choice
39
  if model_path == "en_core_web_lg":
40
-
 
41
  nlp_configuration = {
42
  "nlp_engine_name": "spacy",
43
  "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
44
  }
 
 
 
 
 
 
 
 
45
  else:
 
 
46
  # Using a small spaCy model + a HF NER model
47
  transformers_recognizer = TransformersRecognizer(model_path=model_path)
48
-
49
  if model_path == "StanfordAIMI/stanford-deidentifier-base":
50
  transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
51
  elif model_path == "obi/deid_roberta_i2b2":
@@ -101,6 +117,7 @@ def anonymize(text: str, analyze_results: List[RecognizerResult]):
101
  "from_end": False,
102
  }
103
 
 
104
  elif st_operator == "encrypt":
105
  operator_config = {"key": st_encrypt_key}
106
  elif st_operator == "highlight":
@@ -108,8 +125,11 @@ def anonymize(text: str, analyze_results: List[RecognizerResult]):
108
  else:
109
  operator_config = None
110
 
 
111
  if st_operator == "highlight":
112
  operator = "custom"
 
 
113
  else:
114
  operator = st_operator
115
 
@@ -139,17 +159,39 @@ def annotate(text: str, analyze_results: List[RecognizerResult]):
139
  tokens.append(text[: res.start])
140
 
141
  # append entity text and entity type
142
- tokens.append((text[res.start: res.end], res.entity_type))
143
 
144
  # if another entity coming i.e. we're not at the last results element, add text up to next entity
145
  if i != len(results) - 1:
146
- tokens.append(text[res.end: results[i + 1].start])
147
  # if no more entities coming, add all remaining text
148
  else:
149
- tokens.append(text[res.end:])
150
  return tokens
151
 
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  st.set_page_config(page_title="Presidio demo", layout="wide")
154
 
155
  # Sidebar
@@ -175,20 +217,35 @@ st.sidebar.markdown(
175
  )
176
 
177
  st_model = st.sidebar.selectbox(
178
- "NER model",
179
  [
180
  "StanfordAIMI/stanford-deidentifier-base",
181
  "obi/deid_roberta_i2b2",
 
182
  "en_core_web_lg",
183
  ],
184
  index=1,
 
 
 
 
185
  )
186
  st.sidebar.markdown("> Note: Models might take some time to download. ")
187
 
188
  st_operator = st.sidebar.selectbox(
189
  "De-identification approach",
190
- ["redact", "replace", "mask", "hash", "encrypt", "highlight"],
191
  index=1,
 
 
 
 
 
 
 
 
 
 
192
  )
193
 
194
  if st_operator == "mask":
@@ -198,19 +255,36 @@ if st_operator == "mask":
198
  st_mask_char = st.sidebar.text_input("Mask character", value="*", max_chars=1)
199
  elif st_operator == "encrypt":
200
  st_encrypt_key = st.sidebar.text_input("AES key", value="WmZq4t7w!z%C&F)J")
201
-
 
 
 
 
 
 
 
 
 
 
 
202
  st_threshold = st.sidebar.slider(
203
- label="Acceptance threshold", min_value=0.0, max_value=1.0, value=0.35
 
 
 
 
204
  )
205
 
206
  st_return_decision_process = st.sidebar.checkbox(
207
- "Add analysis explanations to findings", value=False
 
208
  )
209
 
210
  st_entities = st.sidebar.multiselect(
211
  label="Which entities to look for?",
212
  options=get_supported_entities(),
213
  default=list(get_supported_entities()),
 
214
  )
215
 
216
  # Main panel
@@ -242,11 +316,21 @@ st_analyze_results = analyze(
242
  )
243
 
244
  # After
245
- if st_operator != "highlight":
246
  with col2:
247
  st.subheader(f"Output")
248
  st_anonymize_results = anonymize(st_text, st_analyze_results)
249
  st.text_area(label="De-identified", value=st_anonymize_results.text, height=400)
 
 
 
 
 
 
 
 
 
 
250
  else:
251
  st.subheader("Highlighted")
252
  annotated_tokens = annotate(st_text, st_analyze_results)
@@ -269,7 +353,7 @@ st.subheader(
269
  )
270
  if st_analyze_results:
271
  df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
272
- df["text"] = [st_text[res.start: res.end] for res in st_analyze_results]
273
 
274
  df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
275
  {
@@ -281,7 +365,7 @@ if st_analyze_results:
281
  },
282
  axis=1,
283
  )
284
- df_subset["Text"] = [st_text[res.start: res.end] for res in st_analyze_results]
285
  if st_return_decision_process:
286
  analysis_explanation_df = pd.DataFrame.from_records(
287
  [r.analysis_explanation.to_dict() for r in st_analyze_results]
 
1
  """Streamlit app for Presidio."""
2
+ import os
3
  from json import JSONEncoder
4
  from typing import List
5
 
 
12
  from presidio_anonymizer import AnonymizerEngine
13
  from presidio_anonymizer.entities import OperatorConfig
14
 
15
+ from flair_recognizer import FlairRecognizer
16
  from transformers_rec import (
17
  STANFORD_COFIGURATION,
18
  TransformersRecognizer,
19
  BERT_DEID_CONFIGURATION,
20
  )
21
 
22
+ from openai_fake_data_generator import (
23
+ set_openai_key,
24
+ call_completion_model,
25
+ create_prompt,
26
+ )
27
 
28
 
29
  # Helper methods
 
42
 
43
  # Set up NLP Engine according to the model of choice
44
  if model_path == "en_core_web_lg":
45
+ if not spacy.util.is_package("en_core_web_lg"):
46
+ spacy.cli.download("en_core_web_lg")
47
  nlp_configuration = {
48
  "nlp_engine_name": "spacy",
49
  "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
50
  }
51
+ elif model_path == "flair/ner-english-large":
52
+ flair_recognizer = FlairRecognizer()
53
+ nlp_configuration = {
54
+ "nlp_engine_name": "spacy",
55
+ "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
56
+ }
57
+ registry.add_recognizer(flair_recognizer)
58
+ registry.remove_recognizer("SpacyRecognizer")
59
  else:
60
+ if not spacy.util.is_package("en_core_web_sm"):
61
+ spacy.cli.download("en_core_web_sm")
62
  # Using a small spaCy model + a HF NER model
63
  transformers_recognizer = TransformersRecognizer(model_path=model_path)
64
+ registry.remove_recognizer("SpacyRecognizer")
65
  if model_path == "StanfordAIMI/stanford-deidentifier-base":
66
  transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
67
  elif model_path == "obi/deid_roberta_i2b2":
 
117
  "from_end": False,
118
  }
119
 
120
+ # Define operator config
121
  elif st_operator == "encrypt":
122
  operator_config = {"key": st_encrypt_key}
123
  elif st_operator == "highlight":
 
125
  else:
126
  operator_config = None
127
 
128
+ # Change operator if needed as intermediate step
129
  if st_operator == "highlight":
130
  operator = "custom"
131
+ elif st_operator == "synthesize":
132
+ operator = "replace"
133
  else:
134
  operator = st_operator
135
 
 
159
  tokens.append(text[: res.start])
160
 
161
  # append entity text and entity type
162
+ tokens.append((text[res.start : res.end], res.entity_type))
163
 
164
  # if another entity coming i.e. we're not at the last results element, add text up to next entity
165
  if i != len(results) - 1:
166
+ tokens.append(text[res.end : results[i + 1].start])
167
  # if no more entities coming, add all remaining text
168
  else:
169
+ tokens.append(text[res.end :])
170
  return tokens
171
 
172
 
173
def create_fake_data(
    text: str,
    analyze_results: List[RecognizerResult],
    openai_key: str,
    openai_model_name: str,
):
    """Create a synthetic version of the text using OpenAI APIs.

    The text is anonymized first, a prompt is built from the anonymized text,
    and the prompt is sent to the requested OpenAI model.

    :param text: The original text
    :param analyze_results: Analyzer findings for ``text``
    :param openai_key: OpenAI API key; when empty, a message asking for it is returned
    :param openai_model_name: Name of the OpenAI model to call
    :return: The model's output, or the message asking for a key
    """
    if openai_key:
        anonymized = anonymize(text, analyze_results)
        set_openai_key(openai_key)
        return call_openai_api(create_prompt(anonymized.text), openai_model_name)

    return "Please provide your OpenAI key"
187
+
188
+
189
@st.cache_data
def call_openai_api(prompt: str, openai_model_name: str) -> str:
    """Forward the prompt to ``call_completion_model`` using the given model name.

    Kept as a separate function so it can be wrapped with ``st.cache_data``.

    :param prompt: Prompt to send
    :param openai_model_name: Name of the OpenAI model to call
    :return: The text returned by ``call_completion_model``
    """
    return call_completion_model(prompt, model=openai_model_name)
193
+
194
+
195
  st.set_page_config(page_title="Presidio demo", layout="wide")
196
 
197
  # Sidebar
 
217
  )
218
 
219
  st_model = st.sidebar.selectbox(
220
+ "NER model for PII detection",
221
  [
222
  "StanfordAIMI/stanford-deidentifier-base",
223
  "obi/deid_roberta_i2b2",
224
+ "flair/ner-english-large",
225
  "en_core_web_lg",
226
  ],
227
  index=1,
228
+ help="""
229
+ Select which Named Entity Recognition (NER) model to use for PII detection, in parallel to rule-based recognizers.
230
+ Presidio supports multiple NER packages off-the-shelf, such as spaCy, Huggingface, Stanza and Flair.
231
+ """,
232
  )
233
  st.sidebar.markdown("> Note: Models might take some time to download. ")
234
 
235
  st_operator = st.sidebar.selectbox(
236
  "De-identification approach",
237
+ ["redact", "replace", "synthesize", "highlight", "mask", "hash", "encrypt"],
238
  index=1,
239
+ help="""
240
+ Select which manipulation to the text is requested after PII has been identified.\n
241
+ - Redact: Completely remove the PII text\n
242
+ - Replace: Replace the PII text with a constant, e.g. <PERSON>\n
243
+ - Synthesize: Replace with fake values (requires an OpenAI key)\n
244
+ - Highlight: Shows the original text with PII highlighted in colors\n
245
+ - Mask: Replaces a requested number of characters with an asterisk (or other mask character)\n
246
+ - Hash: Replaces with the hash of the PII string\n
247
+ - Encrypt: Replaces with an AES encryption of the PII string, allowing the process to be reversed
248
+ """,
249
  )
250
 
251
  if st_operator == "mask":
 
255
  st_mask_char = st.sidebar.text_input("Mask character", value="*", max_chars=1)
256
  elif st_operator == "encrypt":
257
  st_encrypt_key = st.sidebar.text_input("AES key", value="WmZq4t7w!z%C&F)J")
258
+ elif st_operator == "synthesize":
259
+ st_openai_key = st.sidebar.text_input(
260
+ "OPENAI_KEY",
261
+ value=os.getenv("OPENAI_KEY", default=""),
262
+ help="See https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key for more info.",
263
+ type="password",
264
+ )
265
+ st_openai_model = st.sidebar.text_input(
266
+ "OpenAI model for text synthesis",
267
+ value="text-davinci-003",
268
+ help="See more here: https://platform.openai.com/docs/models/",
269
+ )
270
  st_threshold = st.sidebar.slider(
271
+ label="Acceptance threshold",
272
+ min_value=0.0,
273
+ max_value=1.0,
274
+ value=0.35,
275
+ help="Define the threshold for accepting a detection as PII. See more here: ",
276
  )
277
 
278
  st_return_decision_process = st.sidebar.checkbox(
279
+ "Add analysis explanations to findings", value=False,
280
+ help="Add the decision process to the output table. More information can be found here: https://microsoft.github.io/presidio/analyzer/decision_process/"
281
  )
282
 
283
  st_entities = st.sidebar.multiselect(
284
  label="Which entities to look for?",
285
  options=get_supported_entities(),
286
  default=list(get_supported_entities()),
287
+ help="Limit the list of PII entities detected. This list is dynamic and based on the NER model and registered recognizers. More information can be found here: https://microsoft.github.io/presidio/analyzer/adding_recognizers/"
288
  )
289
 
290
  # Main panel
 
316
  )
317
 
318
  # After
319
+ if st_operator not in ("highlight", "synthesize"):
320
  with col2:
321
  st.subheader(f"Output")
322
  st_anonymize_results = anonymize(st_text, st_analyze_results)
323
  st.text_area(label="De-identified", value=st_anonymize_results.text, height=400)
324
+ elif st_operator == "synthesize":
325
+ with col2:
326
+ st.subheader(f"OpenAI Generated output")
327
+ fake_data = create_fake_data(
328
+ st_text,
329
+ st_analyze_results,
330
+ openai_key=st_openai_key,
331
+ openai_model_name=st_openai_model,
332
+ )
333
+ st.text_area(label="Synthetic data", value=fake_data, height=400)
334
  else:
335
  st.subheader("Highlighted")
336
  annotated_tokens = annotate(st_text, st_analyze_results)
 
353
  )
354
  if st_analyze_results:
355
  df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
356
+ df["text"] = [st_text[res.start : res.end] for res in st_analyze_results]
357
 
358
  df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
359
  {
 
365
  },
366
  axis=1,
367
  )
368
+ df_subset["Text"] = [st_text[res.start : res.end] for res in st_analyze_results]
369
  if st_return_decision_process:
370
  analysis_explanation_df = pd.DataFrame.from_records(
371
  [r.analysis_explanation.to_dict() for r in st_analyze_results]
requirements.txt CHANGED
@@ -4,4 +4,6 @@ streamlit
4
  pandas
5
  st-annotated-text
6
  torch
7
- transformers
 
 
 
4
  pandas
5
  st-annotated-text
6
  torch
7
+ transformers
8
+ flair
9
+ openai