omri374 commited on
Commit
7172378
1 Parent(s): f5f7ba5

Upload 10 files

Browse files
openai_fake_data_generator.py CHANGED
@@ -1,25 +1,50 @@
 
 
 
1
  import openai
 
 
 
 
 
 
 
 
2
 
3
- def set_openai_key(openai_key: str):
 
4
  """Set the OpenAI API key.
5
- :param openai_key: the open AI key (https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key)
 
6
  """
7
- openai.api_key = openai_key
 
 
 
 
8
 
9
 
10
  def call_completion_model(
11
- prompt: str, model: str = "text-davinci-003", max_tokens: int = 512
 
 
 
12
  ) -> str:
13
  """Creates a request for the OpenAI Completion service and returns the response.
14
 
15
  :param prompt: The prompt for the completion model
16
  :param model: OpenAI model name
17
  :param max_tokens: Model's max_tokens parameter
 
18
  """
19
-
20
- response = openai.Completion.create(
21
- model=model, prompt=prompt, max_tokens=max_tokens
22
- )
 
 
 
 
23
 
24
  return response["choices"][0].text
25
 
 
1
+ from collections import namedtuple
2
+ from typing import Optional
3
+
4
  import openai
5
+ import logging
6
+
7
+ logger = logging.getLogger("presidio-streamlit")
8
+
9
# Bundle of OpenAI connection settings passed around the app.
# Fields:
#   openai_key      - OpenAI API key
#   model           - completion model name (e.g. "text-davinci-003")
#   api_base        - Azure OpenAI endpoint URL (None for vanilla OpenAI)
#   deployment_name - Azure OpenAI deployment name (Azure only)
#   api_version     - Azure OpenAI API version (Azure only)
#   api_type        - "azure" or None for the default OpenAI service
# Typename now matches the bound name (was "open_ai_params"), so
# repr() and pickling report the same identifier callers use.
OpenAIParams = namedtuple(
    "OpenAIParams",
    ["openai_key", "model", "api_base", "deployment_name", "api_version", "api_type"],
)
13
 
14
+
15
def set_openai_params(openai_params: OpenAIParams):
    """Configure the global ``openai`` module from an OpenAIParams object.

    Sets the API key and version, and — for Azure OpenAI deployments —
    the endpoint (``api_base``) and ``api_type`` as well.

    :param openai_params: OpenAIParams with openai_key, model, api_base,
        deployment_name, api_version and api_type. The api_base /
        api_type fields only relate to Azure OpenAI deployments.
    """
    openai.api_key = openai_params.openai_key
    openai.api_version = openai_params.api_version
    # api_base is only set for Azure OpenAI; api_type goes with it
    # (NOTE(review): assumed nested under this guard — confirm against upstream).
    if openai_params.api_base:
        openai.api_base = openai_params.api_base
        openai.api_type = openai_params.api_type
25
 
26
 
27
def call_completion_model(
    prompt: str,
    model: str = "text-davinci-003",
    max_tokens: int = 512,
    deployment_id: Optional[str] = None,
) -> str:
    """Creates a request for the OpenAI Completion service and returns the response.

    :param prompt: The prompt for the completion model
    :param model: OpenAI model name
    :param max_tokens: Model's max_tokens parameter
    :param deployment_id: Azure OpenAI deployment ID (omit for vanilla OpenAI)
    :return: The text of the first completion choice
    """
    # Azure OpenAI requires the deployment_id kwarg; plain OpenAI rejects it,
    # so only pass it when one was provided.
    kwargs = {"model": model, "prompt": prompt, "max_tokens": max_tokens}
    if deployment_id:
        kwargs["deployment_id"] = deployment_id
    response = openai.Completion.create(**kwargs)

    return response["choices"][0].text
50
 
presidio_helpers.py CHANGED
@@ -2,22 +2,24 @@
2
  Helper methods for the Presidio Streamlit app
3
  """
4
  from typing import List, Optional, Tuple
5
-
6
  import streamlit as st
7
  from presidio_analyzer import (
8
  AnalyzerEngine,
9
  RecognizerResult,
10
  RecognizerRegistry,
11
  PatternRecognizer,
 
12
  )
13
  from presidio_analyzer.nlp_engine import NlpEngine
14
  from presidio_anonymizer import AnonymizerEngine
15
  from presidio_anonymizer.entities import OperatorConfig
16
 
17
  from openai_fake_data_generator import (
18
- set_openai_key,
19
  call_completion_model,
20
  create_prompt,
 
21
  )
22
  from presidio_nlp_engine_config import (
23
  create_nlp_engine_with_spacy,
@@ -26,6 +28,8 @@ from presidio_nlp_engine_config import (
26
  create_nlp_engine_with_azure_text_analytics,
27
  )
28
 
 
 
29
 
30
  @st.cache_resource
31
  def nlp_engine_and_registry(
@@ -109,6 +113,11 @@ def analyze(
109
  kwargs["ad_hoc_recognizers"] = [ad_hoc_recognizer] if ad_hoc_recognizer else []
110
  del kwargs["deny_list"]
111
 
 
 
 
 
 
112
  return analyzer_engine(model_family, model_path, ta_key, ta_endpoint).analyze(
113
  **kwargs
114
  )
@@ -200,22 +209,30 @@ def annotate(text: str, analyze_results: List[RecognizerResult]):
200
  def create_fake_data(
201
  text: str,
202
  analyze_results: List[RecognizerResult],
203
- openai_key: str,
204
- openai_model_name: str,
205
  ):
206
  """Creates a synthetic version of the text using OpenAI APIs"""
207
- if not openai_key:
208
  return "Please provide your OpenAI key"
209
  results = anonymize(text=text, operator="replace", analyze_results=analyze_results)
210
- set_openai_key(openai_key)
211
  prompt = create_prompt(results.text)
212
- fake = call_openai_api(prompt, openai_model_name)
 
 
 
 
 
213
  return fake
214
 
215
 
216
  @st.cache_data
217
- def call_openai_api(prompt: str, openai_model_name: str) -> str:
218
- fake_data = call_completion_model(prompt, model=openai_model_name)
 
 
 
 
219
  return fake_data
220
 
221
 
@@ -225,6 +242,19 @@ def create_ad_hoc_deny_list_recognizer(
225
  if not deny_list:
226
  return None
227
 
228
- deny_list_recognizer = PatternRecognizer(supported_entity="GENERIC_PII", deny_list=deny_list)
229
- print(deny_list_recognizer.patterns)
 
230
  return deny_list_recognizer
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  Helper methods for the Presidio Streamlit app
3
  """
4
  from typing import List, Optional, Tuple
5
+ import logging
6
  import streamlit as st
7
  from presidio_analyzer import (
8
  AnalyzerEngine,
9
  RecognizerResult,
10
  RecognizerRegistry,
11
  PatternRecognizer,
12
+ Pattern,
13
  )
14
  from presidio_analyzer.nlp_engine import NlpEngine
15
  from presidio_anonymizer import AnonymizerEngine
16
  from presidio_anonymizer.entities import OperatorConfig
17
 
18
  from openai_fake_data_generator import (
19
+ set_openai_params,
20
  call_completion_model,
21
  create_prompt,
22
+ OpenAIParams,
23
  )
24
  from presidio_nlp_engine_config import (
25
  create_nlp_engine_with_spacy,
 
28
  create_nlp_engine_with_azure_text_analytics,
29
  )
30
 
31
+ logger = logging.getLogger("presidio-streamlit")
32
+
33
 
34
  @st.cache_resource
35
  def nlp_engine_and_registry(
 
113
  kwargs["ad_hoc_recognizers"] = [ad_hoc_recognizer] if ad_hoc_recognizer else []
114
  del kwargs["deny_list"]
115
 
116
+ if "regex_params" in kwargs and len(kwargs["regex_params"]) > 0:
117
+ ad_hoc_recognizer = create_ad_hoc_regex_recognizer(*kwargs["regex_params"])
118
+ kwargs["ad_hoc_recognizers"] = [ad_hoc_recognizer] if ad_hoc_recognizer else []
119
+ del kwargs["regex_params"]
120
+
121
  return analyzer_engine(model_family, model_path, ta_key, ta_endpoint).analyze(
122
  **kwargs
123
  )
 
209
def create_fake_data(
    text: str,
    analyze_results: List[RecognizerResult],
    openai_params: OpenAIParams,
):
    """Creates a synthetic version of the text using OpenAI APIs.

    :param text: The original input text
    :param analyze_results: Presidio analyzer findings for ``text``
    :param openai_params: OpenAI connection/model settings
    :return: Synthetic text, or an instruction string if no key was given
    """
    if not openai_params.openai_key:
        return "Please provide your OpenAI key"
    # Replace detected PII with entity placeholders before prompting OpenAI.
    results = anonymize(text=text, operator="replace", analyze_results=analyze_results)
    set_openai_params(openai_params)
    prompt = create_prompt(results.text)
    # Use the module logger instead of a stray debugging print().
    logger.info(f"Prompt: {prompt}")
    fake = call_openai_api(
        prompt=prompt,
        openai_model_name=openai_params.model,
        openai_deployment_name=openai_params.deployment_name,
    )
    return fake
227
 
228
 
229
  @st.cache_data
230
+ def call_openai_api(
231
+ prompt: str, openai_model_name: str, openai_deployment_name: Optional[str] = None
232
+ ) -> str:
233
+ fake_data = call_completion_model(
234
+ prompt, model=openai_model_name, deployment_id=openai_deployment_name
235
+ )
236
  return fake_data
237
 
238
 
 
242
  if not deny_list:
243
  return None
244
 
245
+ deny_list_recognizer = PatternRecognizer(
246
+ supported_entity="GENERIC_PII", deny_list=deny_list
247
+ )
248
  return deny_list_recognizer
249
+
250
+
251
def create_ad_hoc_regex_recognizer(
    regex: str, entity_type: str, score: float, context: Optional[List[str]] = None
) -> Optional[PatternRecognizer]:
    """Build a PatternRecognizer from a user-supplied regex.

    :param regex: Regular expression to detect; falsy/empty returns None
    :param entity_type: Entity type name to report for matches
    :param score: Confidence score assigned to matches
    :param context: Optional context words that boost match confidence
    :return: A PatternRecognizer, or None when no regex was provided
    """
    if not regex:
        return None
    pattern = Pattern(name="Regex pattern", regex=regex, score=score)
    regex_recognizer = PatternRecognizer(
        supported_entity=entity_type, patterns=[pattern], context=context
    )
    return regex_recognizer
presidio_nlp_engine_config.py CHANGED
@@ -1,9 +1,11 @@
1
  from typing import Tuple
2
-
3
  import spacy
4
  from presidio_analyzer import RecognizerRegistry
5
  from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
6
 
 
 
7
 
8
  def create_nlp_engine_with_spacy(
9
  model_path: str,
 
1
  from typing import Tuple
2
+ import logging
3
  import spacy
4
  from presidio_analyzer import RecognizerRegistry
5
  from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
6
 
7
+ logger = logging.getLogger("presidio-streamlit")
8
+
9
 
10
  def create_nlp_engine_with_spacy(
11
  model_path: str,
presidio_streamlit.py CHANGED
@@ -1,13 +1,15 @@
1
  """Streamlit app for Presidio."""
 
2
  import os
3
 
4
  import pandas as pd
5
  import streamlit as st
6
  import streamlit.components.v1 as components
7
-
8
  from annotated_text import annotated_text
9
  from streamlit_tags import st_tags
10
 
 
11
  from presidio_helpers import (
12
  get_supported_entities,
13
  analyze,
@@ -18,29 +20,30 @@ from presidio_helpers import (
18
  nlp_engine_and_registry,
19
  )
20
 
21
- st.set_page_config(page_title="Presidio demo", layout="wide")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  # Sidebar
24
  st.sidebar.header(
25
  """
26
- PII De-Identification with Microsoft Presidio
27
  """
28
  )
29
 
30
- st.sidebar.info(
31
- "Presidio is an open source customizable framework for PII detection and de-identification\n"
32
- "[Code](https://aka.ms/presidio) | "
33
- "[Tutorial](https://microsoft.github.io/presidio/tutorial/) | "
34
- "[Installation](https://microsoft.github.io/presidio/installation/) | "
35
- "[FAQ](https://microsoft.github.io/presidio/faq/)",
36
- icon="ℹ️",
37
- )
38
-
39
- st.sidebar.markdown(
40
- "[![Pypi Downloads](https://img.shields.io/pypi/dm/presidio-analyzer.svg)](https://img.shields.io/pypi/dm/presidio-analyzer.svg)" # noqa
41
- "[![MIT license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT)"
42
- "![GitHub Repo stars](https://img.shields.io/github/stars/microsoft/presidio?style=social)"
43
- )
44
 
45
  model_help_text = """
46
  Select which Named Entity Recognition (NER) model to use for PII detection, in parallel to rule-based recognizers.
@@ -48,51 +51,56 @@ model_help_text = """
48
  as well as service such as Azure Text Analytics PII.
49
  """
50
  st_ta_key = st_ta_endpoint = ""
51
- st_model = "en_core_web_lg"
52
 
53
- st_model_package = st.sidebar.selectbox(
 
 
 
 
 
 
 
 
 
 
 
54
  "NER model package",
55
- ["spaCy", "flair", "HuggingFace", "Azure Text Analytics"],
56
  index=2,
57
- help="Select the NLP package to use for PII detection",
58
  )
59
 
60
- if st_model_package == "spaCy":
61
- st_model = st.sidebar.selectbox(
62
- "NER model for PII detection",
63
- ["en_core_web_lg", "en_core_web_trf", "Other"],
64
- help=model_help_text,
65
- )
66
- elif st_model_package == "HuggingFace":
67
- st_model = st.sidebar.selectbox(
68
- "NER model for PII detection",
69
- ["obi/deid_roberta_i2b2", "StanfordAIMI/stanford-deidentifier-base", "Other"],
70
- help=model_help_text,
71
- )
72
- elif st_model_package == "flair":
73
- st_model = st.sidebar.selectbox(
74
- "NER model for PII detection",
75
- ["flair/ner-english-large", "Other"],
76
- help=model_help_text,
77
- )
78
- elif st_model_package == "Azure Text Analytics":
79
- st_model = st.sidebar.selectbox(
80
- "NER model for PII detection",
81
- ["Azure Text Analytics PII"],
82
- help=model_help_text,
83
- )
84
- st_ta_key = st.sidebar.text_input("Text Analytics Key", type="password")
85
- st_ta_endpoint = st.sidebar.text_input("Text Analytics Endpoint")
86
 
87
  if st_model == "Other":
88
- st_model = st.sidebar.text_input(
89
- f"NER model name for package {st_model_package}", value=""
 
 
 
 
 
 
 
 
 
 
 
90
  )
91
 
92
 
93
  st.sidebar.warning("Note: Models might take some time to download. ")
94
 
95
  analyzer_params = (st_model_package, st_model, st_ta_key, st_ta_endpoint)
 
96
 
97
  st_operator = st.sidebar.selectbox(
98
  "De-identification approach",
@@ -112,8 +120,10 @@ st_operator = st.sidebar.selectbox(
112
  st_mask_char = "*"
113
  st_number_of_chars = 15
114
  st_encrypt_key = "WmZq4t7w!z%C&F)J"
115
- st_openai_key = ""
116
- st_openai_model = "text-davinci-003"
 
 
117
 
118
  if st_operator == "mask":
119
  st_number_of_chars = st.sidebar.number_input(
@@ -125,6 +135,22 @@ if st_operator == "mask":
125
  elif st_operator == "encrypt":
126
  st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
127
  elif st_operator == "synthesize":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  st_openai_key = st.sidebar.text_input(
129
  "OPENAI_KEY",
130
  value=os.getenv("OPENAI_KEY", default=""),
@@ -133,9 +159,21 @@ elif st_operator == "synthesize":
133
  )
134
  st_openai_model = st.sidebar.text_input(
135
  "OpenAI model for text synthesis",
136
- value=st_openai_model,
137
  help="See more here: https://platform.openai.com/docs/models/",
138
  )
 
 
 
 
 
 
 
 
 
 
 
 
139
  st_threshold = st.sidebar.slider(
140
  label="Acceptance threshold",
141
  min_value=0.0,
@@ -153,24 +191,60 @@ st_return_decision_process = st.sidebar.checkbox(
153
 
154
  # Allow and deny lists
155
  st_deny_allow_expander = st.sidebar.expander(
156
- "Allow and deny lists",
157
  expanded=False,
158
  )
159
 
160
  with st_deny_allow_expander:
161
- st_allow_list = st_tags(label="Add words to the allow list", text="Enter word and press enter.")
162
- st.caption('Allow lists contain words that are not considered PII, but are detected as such.')
 
 
 
 
163
 
164
- st_deny_list = st_tags(label="Add words to the deny list", text="Enter word and press enter.")
165
- st.caption("Deny lists contain words that are considered PII, but are not detected as such.")
 
 
 
 
166
  # Main panel
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  analyzer_load_state = st.info("Starting Presidio analyzer...")
168
  nlp_engine, registry = nlp_engine_and_registry(*analyzer_params)
169
 
170
- analyzer = analyzer_engine(*analyzer_params)
171
  analyzer_load_state.empty()
172
 
173
-
174
  # Choose entities
175
  st_entities_expander = st.sidebar.expander("Choose entities to look for")
176
  st_entities = st_entities_expander.multiselect(
@@ -182,6 +256,12 @@ st_entities = st_entities_expander.multiselect(
182
  "More information can be found here: https://microsoft.github.io/presidio/analyzer/adding_recognizers/",
183
  )
184
 
 
 
 
 
 
 
185
  # Read default text
186
  with open("demo_text.txt") as f:
187
  demo_text = f.readlines()
@@ -190,11 +270,9 @@ with open("demo_text.txt") as f:
190
  col1, col2 = st.columns(2)
191
 
192
  # Before:
193
- col1.subheader("Input string:")
194
  st_text = col1.text_area(
195
- label="Enter text",
196
- value="".join(demo_text),
197
- height=400,
198
  )
199
 
200
 
@@ -210,62 +288,65 @@ st_analyze_results = analyze(
210
  )
211
 
212
  # After
213
- if st_operator not in ("highlight", "synthesize"):
214
- with col2:
215
- st.subheader(f"Output")
216
- st_anonymize_results = anonymize(
217
- text=st_text,
218
- operator=st_operator,
219
- mask_char=st_mask_char,
220
- number_of_chars=st_number_of_chars,
221
- encrypt_key=st_encrypt_key,
222
- analyze_results=st_analyze_results,
223
- )
224
- st.text_area(label="De-identified", value=st_anonymize_results.text, height=400)
225
- elif st_operator == "synthesize":
226
- with col2:
227
- st.subheader(f"OpenAI Generated output")
228
- fake_data = create_fake_data(
229
- st_text,
230
- st_analyze_results,
231
- openai_key=st_openai_key,
232
- openai_model_name=st_openai_model,
233
- )
234
- st.text_area(label="Synthetic data", value=fake_data, height=400)
235
- else:
236
- st.subheader("Highlighted")
237
- annotated_tokens = annotate(text=st_text, analyze_results=st_analyze_results)
238
- # annotated_tokens
239
- annotated_text(*annotated_tokens)
240
-
241
-
242
- # table result
243
- st.subheader(
244
- "Findings" if not st_return_decision_process else "Findings with decision factors"
245
- )
246
- if st_analyze_results:
247
- df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
248
- df["text"] = [st_text[res.start : res.end] for res in st_analyze_results]
249
-
250
- df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
251
- {
252
- "entity_type": "Entity type",
253
- "text": "Text",
254
- "start": "Start",
255
- "end": "End",
256
- "score": "Confidence",
257
- },
258
- axis=1,
259
  )
260
- df_subset["Text"] = [st_text[res.start : res.end] for res in st_analyze_results]
261
- if st_return_decision_process:
262
- analysis_explanation_df = pd.DataFrame.from_records(
263
- [r.analysis_explanation.to_dict() for r in st_analyze_results]
 
 
 
 
 
 
 
 
 
264
  )
265
- df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)
266
- st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
267
- else:
268
- st.text("No findings")
 
 
 
 
 
269
 
270
  components.html(
271
  """
 
1
  """Streamlit app for Presidio."""
2
+ import logging
3
  import os
4
 
5
  import pandas as pd
6
  import streamlit as st
7
  import streamlit.components.v1 as components
8
+ import dotenv
9
  from annotated_text import annotated_text
10
  from streamlit_tags import st_tags
11
 
12
+ from openai_fake_data_generator import OpenAIParams
13
  from presidio_helpers import (
14
  get_supported_entities,
15
  analyze,
 
20
  nlp_engine_and_registry,
21
  )
22
 
23
+ st.set_page_config(
24
+ page_title="Presidio demo",
25
+ layout="wide",
26
+ initial_sidebar_state="expanded",
27
+ menu_items={
28
+ "About": "https://microsoft.github.io/presidio/",
29
+ },
30
+ )
31
+
32
+ dotenv.load_dotenv()
33
+ logger = logging.getLogger("presidio-streamlit")
34
+
35
+
36
+ allow_other_models = os.getenv("ALLOW_OTHER_MODELS", False)
37
+
38
+ can_present_results = True
39
 
40
  # Sidebar
41
  st.sidebar.header(
42
  """
43
+ PII De-Identification with [Microsoft Presidio](https://microsoft.github.io/presidio/)
44
  """
45
  )
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
  model_help_text = """
49
  Select which Named Entity Recognition (NER) model to use for PII detection, in parallel to rule-based recognizers.
 
51
  as well as service such as Azure Text Analytics PII.
52
  """
53
  st_ta_key = st_ta_endpoint = ""
 
54
 
55
+ model_list = [
56
+ "spaCy/en_core_web_lg",
57
+ "flair/ner-english-large",
58
+ "HuggingFace/obi/deid_roberta_i2b2",
59
+ "HuggingFace/StanfordAIMI/stanford-deidentifier-base",
60
+ "Azure Text Analytics PII",
61
+ "Other",
62
+ ]
63
+ if allow_other_models:
64
+ model_list.pop()
65
+ # Select model
66
+ st_model = st.sidebar.selectbox(
67
  "NER model package",
68
+ model_list,
69
  index=2,
70
+ help=model_help_text,
71
  )
72
 
73
+ # Extract model package.
74
+ st_model_package = st_model.split("/")[0]
75
+
76
+ # Remove package prefix (if needed)
77
+ st_model = (
78
+ st_model
79
+ if st_model_package not in ("spaCy", "HuggingFace")
80
+ else "/".join(st_model.split("/")[1:])
81
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
  if st_model == "Other":
84
+ st_model_package = st.sidebar.selectbox(
85
+ "NER model OSS package", options=["spaCy", "Flair", "HuggingFace"]
86
+ )
87
+ st_model = st.sidebar.text_input(f"NER model name", value="")
88
+
89
+ if st_model == "Azure Text Analytics PII":
90
+ st_ta_key = st.sidebar.text_input(
91
+ f"Text Analytics key", value=os.getenv("TA_KEY", ""), type="password"
92
+ )
93
+ st_ta_endpoint = st.sidebar.text_input(
94
+ f"Text Analytics endpoint",
95
+ value=os.getenv("TA_ENDPOINT", default=""),
96
+ help="For more info: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/overview", # noqa: E501
97
  )
98
 
99
 
100
  st.sidebar.warning("Note: Models might take some time to download. ")
101
 
102
  analyzer_params = (st_model_package, st_model, st_ta_key, st_ta_endpoint)
103
+ logger.debug(f"analyzer_params: {analyzer_params}")
104
 
105
  st_operator = st.sidebar.selectbox(
106
  "De-identification approach",
 
120
  st_mask_char = "*"
121
  st_number_of_chars = 15
122
  st_encrypt_key = "WmZq4t7w!z%C&F)J"
123
+
124
+ open_ai_params = None
125
+
126
+ logger.debug(f"st_operator: {st_operator}")
127
 
128
  if st_operator == "mask":
129
  st_number_of_chars = st.sidebar.number_input(
 
135
  elif st_operator == "encrypt":
136
  st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
137
  elif st_operator == "synthesize":
138
+ if os.getenv("OPENAI_TYPE", default="openai") == "Azure":
139
+ openai_api_type = "azure"
140
+ st_openai_api_base = st.sidebar.text_input(
141
+ "Azure OpenAI base URL",
142
+ value=os.getenv("AZURE_OPENAI_ENDPOINT", default=""),
143
+ )
144
+ st_deployment_name = st.sidebar.text_input(
145
+ "Deployment name", value=os.getenv("AZURE_OPENAI_DEPLOYMENT", default="")
146
+ )
147
+ st_openai_version = st.sidebar.text_input(
148
+ "OpenAI version",
149
+ value=os.getenv("OPENAI_API_VERSION", default="2023-05-15"),
150
+ )
151
+ else:
152
+ st_openai_version = openai_api_type = st_openai_api_base = None
153
+ st_deployment_name = ""
154
  st_openai_key = st.sidebar.text_input(
155
  "OPENAI_KEY",
156
  value=os.getenv("OPENAI_KEY", default=""),
 
159
  )
160
  st_openai_model = st.sidebar.text_input(
161
  "OpenAI model for text synthesis",
162
+ value=os.getenv("OPENAI_MODEL", default="text-davinci-003"),
163
  help="See more here: https://platform.openai.com/docs/models/",
164
  )
165
+
166
+ open_ai_params = OpenAIParams(
167
+ openai_key=st_openai_key,
168
+ model=st_openai_model,
169
+ api_base=st_openai_api_base,
170
+ deployment_name=st_deployment_name,
171
+ api_version=st_openai_version,
172
+ api_type=openai_api_type,
173
+ )
174
+
175
+ can_present_results = True if st_openai_key else False
176
+
177
  st_threshold = st.sidebar.slider(
178
  label="Acceptance threshold",
179
  min_value=0.0,
 
191
 
192
  # Allow and deny lists
193
  st_deny_allow_expander = st.sidebar.expander(
194
+ "Allowlists and denylists",
195
  expanded=False,
196
  )
197
 
198
  with st_deny_allow_expander:
199
+ st_allow_list = st_tags(
200
+ label="Add words to the allowlist", text="Enter word and press enter."
201
+ )
202
+ st.caption(
203
+ "Allowlists contain words that are not considered PII, but are detected as such."
204
+ )
205
 
206
+ st_deny_list = st_tags(
207
+ label="Add words to the denylist", text="Enter word and press enter."
208
+ )
209
+ st.caption(
210
+ "Denylists contain words that are considered PII, but are not detected as such."
211
+ )
212
  # Main panel
213
+
214
+ with st.expander("About this demo", expanded=False):
215
+ st.info(
216
+ """Presidio is an open source customizable framework for PII detection and de-identification.
217
+ \n\n[Code](https://aka.ms/presidio) |
218
+ [Tutorial](https://microsoft.github.io/presidio/tutorial/) |
219
+ [Installation](https://microsoft.github.io/presidio/installation/) |
220
+ [FAQ](https://microsoft.github.io/presidio/faq/) |"""
221
+ )
222
+
223
+ st.info(
224
+ """
225
+ Use this demo to:
226
+ - Experiment with different off-the-shelf models and NLP packages.
227
+ - Explore the different de-identification options, including redaction, masking, encryption and more.
228
+ - Generate synthetic text with Microsoft Presidio and OpenAI.
229
+ - Configure allow and deny lists.
230
+
231
+ This demo website shows some of Presidio's capabilities.
232
+ [Visit our website](https://microsoft.github.io/presidio) for more info,
233
+ samples and deployment options.
234
+ """
235
+ )
236
+
237
+ st.markdown(
238
+ "[![Pypi Downloads](https://img.shields.io/pypi/dm/presidio-analyzer.svg)](https://img.shields.io/pypi/dm/presidio-analyzer.svg)" # noqa
239
+ "[![MIT license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT)"
240
+ "![GitHub Repo stars](https://img.shields.io/github/stars/microsoft/presidio?style=social)"
241
+ )
242
+
243
  analyzer_load_state = st.info("Starting Presidio analyzer...")
244
  nlp_engine, registry = nlp_engine_and_registry(*analyzer_params)
245
 
 
246
  analyzer_load_state.empty()
247
 
 
248
  # Choose entities
249
  st_entities_expander = st.sidebar.expander("Choose entities to look for")
250
  st_entities = st_entities_expander.multiselect(
 
256
  "More information can be found here: https://microsoft.github.io/presidio/analyzer/adding_recognizers/",
257
  )
258
 
259
+
260
+ analyzer_load_state = st.info("Starting Presidio analyzer...")
261
+ analyzer = analyzer_engine(*analyzer_params)
262
+ analyzer_load_state.empty()
263
+
264
+
265
  # Read default text
266
  with open("demo_text.txt") as f:
267
  demo_text = f.readlines()
 
270
  col1, col2 = st.columns(2)
271
 
272
  # Before:
273
+ col1.subheader("Input")
274
  st_text = col1.text_area(
275
+ label="Enter text", value="".join(demo_text), height=400, key="text_input"
 
 
276
  )
277
 
278
 
 
288
  )
289
 
290
  # After
291
+ if can_present_results:
292
+ if st_operator not in ("highlight", "synthesize"):
293
+ with col2:
294
+ st.subheader(f"Output")
295
+ st_anonymize_results = anonymize(
296
+ text=st_text,
297
+ operator=st_operator,
298
+ mask_char=st_mask_char,
299
+ number_of_chars=st_number_of_chars,
300
+ encrypt_key=st_encrypt_key,
301
+ analyze_results=st_analyze_results,
302
+ )
303
+ st.text_area(
304
+ label="De-identified", value=st_anonymize_results.text, height=400
305
+ )
306
+ elif st_operator == "synthesize":
307
+ with col2:
308
+ st.subheader(f"OpenAI Generated output")
309
+ fake_data = create_fake_data(
310
+ st_text,
311
+ st_analyze_results,
312
+ open_ai_params,
313
+ )
314
+ st.text_area(label="Synthetic data", value=fake_data, height=400)
315
+ else:
316
+ st.subheader("Highlighted")
317
+ annotated_tokens = annotate(text=st_text, analyze_results=st_analyze_results)
318
+ # annotated_tokens
319
+ annotated_text(*annotated_tokens)
320
+
321
+ # table result
322
+ st.subheader(
323
+ "Findings"
324
+ if not st_return_decision_process
325
+ else "Findings with decision factors"
 
 
 
 
 
 
 
 
 
 
 
326
  )
327
+ if st_analyze_results:
328
+ df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
329
+ df["text"] = [st_text[res.start : res.end] for res in st_analyze_results]
330
+
331
+ df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
332
+ {
333
+ "entity_type": "Entity type",
334
+ "text": "Text",
335
+ "start": "Start",
336
+ "end": "End",
337
+ "score": "Confidence",
338
+ },
339
+ axis=1,
340
  )
341
+ df_subset["Text"] = [st_text[res.start : res.end] for res in st_analyze_results]
342
+ if st_return_decision_process:
343
+ analysis_explanation_df = pd.DataFrame.from_records(
344
+ [r.analysis_explanation.to_dict() for r in st_analyze_results]
345
+ )
346
+ df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)
347
+ st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
348
+ else:
349
+ st.text("No findings")
350
 
351
  components.html(
352
  """
requirements.txt CHANGED
@@ -3,6 +3,7 @@ presidio-anonymizer
3
  streamlit
4
  streamlit-tags
5
  pandas
 
6
  st-annotated-text
7
  torch
8
  transformers
 
3
  streamlit
4
  streamlit-tags
5
  pandas
6
+ python-dotenv
7
  st-annotated-text
8
  torch
9
  transformers
text_analytics_wrapper.py CHANGED
@@ -1,6 +1,6 @@
1
  import os
2
  from typing import List, Optional
3
-
4
  import dotenv
5
  from azure.ai.textanalytics import TextAnalyticsClient
6
  from azure.core.credentials import AzureKeyCredential
@@ -8,6 +8,8 @@ from azure.core.credentials import AzureKeyCredential
8
  from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalysisExplanation
9
  from presidio_analyzer.nlp_engine import NlpArtifacts
10
 
 
 
11
  class TextAnalyticsWrapper(EntityRecognizer):
12
  from azure.ai.textanalytics._models import PiiEntityCategory
13
  TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]
 
1
  import os
2
  from typing import List, Optional
3
+ import logging
4
  import dotenv
5
  from azure.ai.textanalytics import TextAnalyticsClient
6
  from azure.core.credentials import AzureKeyCredential
 
8
  from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalysisExplanation
9
  from presidio_analyzer.nlp_engine import NlpArtifacts
10
 
11
+ logger = logging.getLogger("presidio-streamlit")
12
+
13
  class TextAnalyticsWrapper(EntityRecognizer):
14
  from azure.ai.textanalytics._models import PiiEntityCategory
15
  TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]