presidio committed on
Commit
d6241cc
·
1 Parent(s): 3afd122

Upload 7 files

Browse files
Files changed (2) hide show
  1. presidio_helpers.py +203 -0
  2. presidio_streamlit.py +44 -200
presidio_helpers.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Helper methods for the Presidio Streamlit app
3
+ """
4
+ from typing import List, Optional
5
+
6
+ import spacy
7
+ import streamlit as st
8
+ from presidio_analyzer import AnalyzerEngine, RecognizerResult, RecognizerRegistry
9
+ from presidio_analyzer.nlp_engine import NlpEngineProvider
10
+ from presidio_anonymizer import AnonymizerEngine
11
+ from presidio_anonymizer.entities import OperatorConfig
12
+
13
+ from flair_recognizer import FlairRecognizer
14
+ from openai_fake_data_generator import (
15
+ set_openai_key,
16
+ call_completion_model,
17
+ create_prompt,
18
+ )
19
+ from transformers_rec import (
20
+ STANFORD_COFIGURATION,
21
+ TransformersRecognizer,
22
+ BERT_DEID_CONFIGURATION,
23
+ )
24
+
25
+
26
def _ensure_spacy_model(model_name: str) -> None:
    """Download the spaCy model ``model_name`` if it is not installed yet."""
    if not spacy.util.is_package(model_name):
        spacy.cli.download(model_name)


@st.cache_resource
def analyzer_engine(model_path: str):
    """Return an AnalyzerEngine set up for the requested NER model.

    The registry always starts from Presidio's predefined recognizers. For
    every model other than "en_core_web_lg" the "SpacyRecognizer" is removed
    and a Flair or Transformers recognizer is registered in its place, on top
    of the small spaCy model.

    :param model_path: Which model to use for NER:
        "StanfordAIMI/stanford-deidentifier-base",
        "obi/deid_roberta_i2b2",
        "flair/ner-english-large",
        "en_core_web_lg"
    :return: An AnalyzerEngine built from the spaCy NLP engine and the
        recognizer registry assembled here.
    """
    registry = RecognizerRegistry()
    registry.load_predefined_recognizers()

    # Set up the NLP engine and recognizers according to the model of choice
    if model_path == "en_core_web_lg":
        spacy_model = "en_core_web_lg"
        _ensure_spacy_model(spacy_model)
    elif model_path == "flair/ner-english-large":
        # The flair branch also runs on the small spaCy model, so make sure
        # it is installed here as well (the other branches already did).
        spacy_model = "en_core_web_sm"
        _ensure_spacy_model(spacy_model)
        flair_recognizer = FlairRecognizer()
        registry.add_recognizer(flair_recognizer)
        registry.remove_recognizer("SpacyRecognizer")
    else:
        # Use small spaCy model, no need for both spacy and HF models.
        # The transformers model is used here as a recognizer,
        # not as an NlpEngine.
        spacy_model = "en_core_web_sm"
        _ensure_spacy_model(spacy_model)
        transformers_recognizer = TransformersRecognizer(model_path=model_path)
        registry.remove_recognizer("SpacyRecognizer")
        if model_path == "StanfordAIMI/stanford-deidentifier-base":
            transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
        elif model_path == "obi/deid_roberta_i2b2":
            transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
        # NOTE(review): any other model_path is registered without a
        # load_transformer() call -- confirm this is intended.
        registry.add_recognizer(transformers_recognizer)

    nlp_configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": "en", "model_name": spacy_model}],
    }
    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

    return AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)
79
+
80
+
81
@st.cache_resource
def anonymizer_engine():
    """Build a Presidio AnonymizerEngine (wrapped in st.cache_resource)."""
    engine = AnonymizerEngine()
    return engine
85
+
86
+
87
@st.cache_data
def get_supported_entities(st_model: str):
    """List the entities supported by the analyzer built for ``st_model``.

    :param st_model: Model name/path forwarded to ``analyzer_engine``
    """
    engine = analyzer_engine(st_model)
    return engine.get_supported_entities()
91
+
92
+
93
@st.cache_data
def analyze(st_model: str, **kwargs):
    """Analyze input using the Analyzer engine and input arguments (kwargs).

    :param st_model: Model name/path forwarded to ``analyzer_engine``
    :param kwargs: Keyword arguments passed on to ``AnalyzerEngine.analyze``.
        If "entities" is missing, is None, or contains "All", it is passed
        on as None.
    :return: Whatever ``AnalyzerEngine.analyze`` returns for these arguments
    """
    entities = kwargs.get("entities")
    # An explicit entities=None used to raise TypeError on the "All" check;
    # treat it the same way as a missing key.
    if entities is None or "All" in entities:
        kwargs["entities"] = None
    return analyzer_engine(st_model).analyze(**kwargs)
99
+
100
+
101
def anonymize(
    text: str,
    operator: str,
    analyze_results: List[RecognizerResult],
    mask_char: Optional[str] = None,
    number_of_chars: Optional[int] = None,
    encrypt_key: Optional[str] = None,
):
    """Anonymize identified input using Presidio Anonymizer.

    :param text: Full text
    :param operator: Operator name. "highlight" is run as the "custom"
        operator with an identity lambda, and "synthesize" is run as
        "replace"; any other name is passed through unchanged.
    :param analyze_results: list of results from presidio analyzer engine
    :param mask_char: Mask char (for mask operator)
    :param number_of_chars: Number of characters to mask (for mask operator)
    :param encrypt_key: Encryption key (for encrypt operator)
    :return: The result of ``AnonymizerEngine.anonymize`` with the chosen
        operator applied as the "DEFAULT" operator
    """
    # Define operator config
    if operator == "mask":
        operator_config = {
            "type": "mask",
            "masking_char": mask_char,
            "chars_to_mask": number_of_chars,
            "from_end": False,
        }
    elif operator == "encrypt":
        operator_config = {"key": encrypt_key}
    elif operator == "highlight":
        # Identity transform: entity text is left as-is
        operator_config = {"lambda": lambda x: x}
    else:
        operator_config = None

    # Change operator if needed as intermediate step
    operator_aliases = {"highlight": "custom", "synthesize": "replace"}
    operator = operator_aliases.get(operator, operator)

    return anonymizer_engine().anonymize(
        text,
        analyze_results,
        operators={"DEFAULT": OperatorConfig(operator, operator_config)},
    )
149
+
150
+
151
def annotate(text: str, analyze_results: List[RecognizerResult]):
    """Highlight the identified PII entities on the original text.

    :param text: Full text
    :param analyze_results: list of results from presidio analyzer engine
    :return: A list alternating plain-text strings and
        ``(entity_text, entity_type)`` tuples, in text order. When no
        entities come back, the list holds just the full text.
    """
    # Use the anonymizer to resolve overlaps
    results = anonymize(
        text=text,
        operator="highlight",
        analyze_results=analyze_results,
    )

    # sort by start index
    spans = sorted(results.items, key=lambda x: x.start)

    tokens = []
    cursor = 0  # end index of the previously emitted entity
    for span in spans:
        # plain text between the previous entity (or text start) and this one
        tokens.append(text[cursor : span.start])
        # entity text and entity type
        tokens.append((text[span.start : span.end], span.entity_type))
        cursor = span.end

    # Remaining text after the last entity. With no entities at all this is
    # the whole text, which was previously dropped (empty list returned).
    tokens.append(text[cursor:])
    return tokens
182
+
183
+
184
def create_fake_data(
    text: str,
    analyze_results: List[RecognizerResult],
    openai_key: str,
    openai_model_name: str,
):
    """Create a synthetic version of the text using OpenAI APIs.

    :param text: Full text
    :param analyze_results: list of results from presidio analyzer engine
    :param openai_key: OpenAI key; when empty, a message asking for the key
        is returned instead of synthetic text
    :param openai_model_name: Name of the OpenAI model to call
    """
    if openai_key:
        # Run the "replace" operator first, then build the prompt from
        # the anonymized text
        replaced = anonymize(
            text=text, operator="replace", analyze_results=analyze_results
        )
        set_openai_key(openai_key)
        return call_openai_api(create_prompt(replaced.text), openai_model_name)
    return "Please provide your OpenAI key"
198
+
199
+
200
@st.cache_data
def call_openai_api(prompt: str, openai_model_name: str) -> str:
    """Send ``prompt`` to the completion model and return its output.

    :param prompt: Prompt text passed to ``call_completion_model``
    :param openai_model_name: Model name passed as ``model``
    """
    return call_completion_model(prompt, model=openai_model_name)
presidio_streamlit.py CHANGED
@@ -1,197 +1,20 @@
1
  """Streamlit app for Presidio."""
2
  import os
3
  from json import JSONEncoder
4
- from typing import List
5
 
6
  import pandas as pd
7
- import spacy
8
  import streamlit as st
9
  from annotated_text import annotated_text
10
- from presidio_analyzer import AnalyzerEngine, RecognizerResult, RecognizerRegistry
11
- from presidio_analyzer.nlp_engine import NlpEngineProvider
12
- from presidio_anonymizer import AnonymizerEngine
13
- from presidio_anonymizer.entities import OperatorConfig
14
 
15
- from flair_recognizer import FlairRecognizer
16
- from transformers_rec import (
17
- STANFORD_COFIGURATION,
18
- TransformersRecognizer,
19
- BERT_DEID_CONFIGURATION,
 
 
20
  )
21
 
22
- from openai_fake_data_generator import (
23
- set_openai_key,
24
- call_completion_model,
25
- create_prompt,
26
- )
27
-
28
-
29
- # Helper methods
30
- @st.cache_resource
31
- def analyzer_engine(model_path: str):
32
- """Return AnalyzerEngine.
33
-
34
- :param model_path: Which model to use for NER:
35
- "StanfordAIMI/stanford-deidentifier-base",
36
- "obi/deid_roberta_i2b2",
37
- "en_core_web_lg"
38
- """
39
-
40
- registry = RecognizerRegistry()
41
- registry.load_predefined_recognizers()
42
-
43
- # Set up NLP Engine according to the model of choice
44
- if model_path == "en_core_web_lg":
45
- if not spacy.util.is_package("en_core_web_lg"):
46
- spacy.cli.download("en_core_web_lg")
47
- nlp_configuration = {
48
- "nlp_engine_name": "spacy",
49
- "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
50
- }
51
- elif model_path == "flair/ner-english-large":
52
- flair_recognizer = FlairRecognizer()
53
- nlp_configuration = {
54
- "nlp_engine_name": "spacy",
55
- "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
56
- }
57
- registry.add_recognizer(flair_recognizer)
58
- registry.remove_recognizer("SpacyRecognizer")
59
- else:
60
- if not spacy.util.is_package("en_core_web_sm"):
61
- spacy.cli.download("en_core_web_sm")
62
- # Using a small spaCy model + a HF NER model
63
- transformers_recognizer = TransformersRecognizer(model_path=model_path)
64
- registry.remove_recognizer("SpacyRecognizer")
65
- if model_path == "StanfordAIMI/stanford-deidentifier-base":
66
- transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
67
- elif model_path == "obi/deid_roberta_i2b2":
68
- transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
69
-
70
- # Use small spaCy model, no need for both spacy and HF models
71
- # The transformers model is used here as a recognizer, not as an NlpEngine
72
- nlp_configuration = {
73
- "nlp_engine_name": "spacy",
74
- "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
75
- }
76
-
77
- registry.add_recognizer(transformers_recognizer)
78
-
79
- nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
80
-
81
- analyzer = AnalyzerEngine(nlp_engine=nlp_engine, registry=registry)
82
- return analyzer
83
-
84
-
85
- @st.cache_resource
86
- def anonymizer_engine():
87
- """Return AnonymizerEngine."""
88
- return AnonymizerEngine()
89
-
90
-
91
- @st.cache_data
92
- def get_supported_entities():
93
- """Return supported entities from the Analyzer Engine."""
94
- return analyzer_engine(st_model).get_supported_entities()
95
-
96
-
97
- @st.cache_data
98
- def analyze(**kwargs):
99
- """Analyze input using Analyzer engine and input arguments (kwargs)."""
100
- if "entities" not in kwargs or "All" in kwargs["entities"]:
101
- kwargs["entities"] = None
102
- return analyzer_engine(st_model).analyze(**kwargs)
103
-
104
-
105
- def anonymize(text: str, analyze_results: List[RecognizerResult]):
106
- """Anonymize identified input using Presidio Anonymizer.
107
-
108
- :param text: Full text
109
- :param analyze_results: list of results from presidio analyzer engine
110
- """
111
-
112
- if st_operator == "mask":
113
- operator_config = {
114
- "type": "mask",
115
- "masking_char": st_mask_char,
116
- "chars_to_mask": st_number_of_chars,
117
- "from_end": False,
118
- }
119
-
120
- # Define operator config
121
- elif st_operator == "encrypt":
122
- operator_config = {"key": st_encrypt_key}
123
- elif st_operator == "highlight":
124
- operator_config = {"lambda": lambda x: x}
125
- else:
126
- operator_config = None
127
-
128
- # Change operator if needed as intermediate step
129
- if st_operator == "highlight":
130
- operator = "custom"
131
- elif st_operator == "synthesize":
132
- operator = "replace"
133
- else:
134
- operator = st_operator
135
-
136
- res = anonymizer_engine().anonymize(
137
- text,
138
- analyze_results,
139
- operators={"DEFAULT": OperatorConfig(operator, operator_config)},
140
- )
141
- return res
142
-
143
-
144
- def annotate(text: str, analyze_results: List[RecognizerResult]):
145
- """
146
- Highlights every identified entity on top of the text.
147
- :param text: full text
148
- :param analyze_results: list of analyzer results.
149
- """
150
- tokens = []
151
-
152
- # Use the anonymizer to resolve overlaps
153
- results = anonymize(text, analyze_results)
154
-
155
- # sort by start index
156
- results = sorted(results.items, key=lambda x: x.start)
157
- for i, res in enumerate(results):
158
- if i == 0:
159
- tokens.append(text[: res.start])
160
-
161
- # append entity text and entity type
162
- tokens.append((text[res.start : res.end], res.entity_type))
163
-
164
- # if another entity coming i.e. we're not at the last results element, add text up to next entity
165
- if i != len(results) - 1:
166
- tokens.append(text[res.end : results[i + 1].start])
167
- # if no more entities coming, add all remaining text
168
- else:
169
- tokens.append(text[res.end :])
170
- return tokens
171
-
172
-
173
- def create_fake_data(
174
- text: str,
175
- analyze_results: List[RecognizerResult],
176
- openai_key: str,
177
- openai_model_name: str,
178
- ):
179
- """Creates a synthetic version of the text using OpenAI APIs"""
180
- if not openai_key:
181
- return "Please provide your OpenAI key"
182
- results = anonymize(text, analyze_results)
183
- set_openai_key(openai_key)
184
- prompt = create_prompt(results.text)
185
- fake = call_openai_api(prompt, openai_model_name)
186
- return fake
187
-
188
-
189
- @st.cache_data
190
- def call_openai_api(prompt: str, openai_model_name: str) -> str:
191
- fake_data = call_completion_model(prompt, model=openai_model_name)
192
- return fake_data
193
-
194
-
195
  st.set_page_config(page_title="Presidio demo", layout="wide")
196
 
197
  # Sidebar
@@ -211,8 +34,8 @@ st.sidebar.info(
211
  )
212
 
213
  st.sidebar.markdown(
214
- "[![Pypi Downloads](https://img.shields.io/pypi/dm/presidio-analyzer.svg)](https://img.shields.io/pypi/dm/presidio-analyzer.svg)"
215
- "[![MIT license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](http://opensource.org/licenses/MIT)"
216
  "![GitHub Repo stars](https://img.shields.io/github/stars/microsoft/presidio?style=social)"
217
  )
218
 
@@ -247,14 +70,20 @@ st_operator = st.sidebar.selectbox(
247
  - Encrypt: Replaces with an AES encryption of the PII string, allowing the process to be reversed
248
  """,
249
  )
250
-
 
 
 
 
251
  if st_operator == "mask":
252
  st_number_of_chars = st.sidebar.number_input(
253
- "number of chars", value=15, min_value=0, max_value=100
 
 
 
254
  )
255
- st_mask_char = st.sidebar.text_input("Mask character", value="*", max_chars=1)
256
  elif st_operator == "encrypt":
257
- st_encrypt_key = st.sidebar.text_input("AES key", value="WmZq4t7w!z%C&F)J")
258
  elif st_operator == "synthesize":
259
  st_openai_key = st.sidebar.text_input(
260
  "OPENAI_KEY",
@@ -264,7 +93,7 @@ elif st_operator == "synthesize":
264
  )
265
  st_openai_model = st.sidebar.text_input(
266
  "OpenAI model for text synthesis",
267
- value="text-davinci-003",
268
  help="See more here: https://platform.openai.com/docs/models/",
269
  )
270
  st_threshold = st.sidebar.slider(
@@ -276,15 +105,19 @@ st_threshold = st.sidebar.slider(
276
  )
277
 
278
  st_return_decision_process = st.sidebar.checkbox(
279
- "Add analysis explanations to findings", value=False,
280
- help="Add the decision process to the output table. More information can be found here: https://microsoft.github.io/presidio/analyzer/decision_process/"
 
 
281
  )
282
 
283
  st_entities = st.sidebar.multiselect(
284
  label="Which entities to look for?",
285
- options=get_supported_entities(),
286
- default=list(get_supported_entities()),
287
- help="Limit the list of PII entities detected. This list is dynamic and based on the NER model and registered recognizers. More information can be found here: https://microsoft.github.io/presidio/analyzer/adding_recognizers/"
 
 
288
  )
289
 
290
  # Main panel
@@ -308,6 +141,7 @@ st_text = col1.text_area(
308
  )
309
 
310
  st_analyze_results = analyze(
 
311
  text=st_text,
312
  entities=st_entities,
313
  language="en",
@@ -319,7 +153,14 @@ st_analyze_results = analyze(
319
  if st_operator not in ("highlight", "synthesize"):
320
  with col2:
321
  st.subheader(f"Output")
322
- st_anonymize_results = anonymize(st_text, st_analyze_results)
 
 
 
 
 
 
 
323
  st.text_area(label="De-identified", value=st_anonymize_results.text, height=400)
324
  elif st_operator == "synthesize":
325
  with col2:
@@ -333,7 +174,10 @@ elif st_operator == "synthesize":
333
  st.text_area(label="Synthetic data", value=fake_data, height=400)
334
  else:
335
  st.subheader("Highlighted")
336
- annotated_tokens = annotate(st_text, st_analyze_results)
 
 
 
337
  # annotated_tokens
338
  annotated_text(*annotated_tokens)
339
 
@@ -353,7 +197,7 @@ st.subheader(
353
  )
354
  if st_analyze_results:
355
  df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
356
- df["text"] = [st_text[res.start : res.end] for res in st_analyze_results]
357
 
358
  df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
359
  {
@@ -365,7 +209,7 @@ if st_analyze_results:
365
  },
366
  axis=1,
367
  )
368
- df_subset["Text"] = [st_text[res.start : res.end] for res in st_analyze_results]
369
  if st_return_decision_process:
370
  analysis_explanation_df = pd.DataFrame.from_records(
371
  [r.analysis_explanation.to_dict() for r in st_analyze_results]
 
1
  """Streamlit app for Presidio."""
2
  import os
3
  from json import JSONEncoder
 
4
 
5
  import pandas as pd
 
6
  import streamlit as st
7
  from annotated_text import annotated_text
 
 
 
 
8
 
9
+ from presidio_helpers import (
10
+ get_supported_entities,
11
+ analyze,
12
+ anonymize,
13
+ annotate,
14
+ create_fake_data,
15
+ analyzer_engine,
16
  )
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  st.set_page_config(page_title="Presidio demo", layout="wide")
19
 
20
  # Sidebar
 
34
  )
35
 
36
  st.sidebar.markdown(
37
+ "[![Pypi Downloads](https://img.shields.io/pypi/dm/presidio-analyzer.svg)](https://img.shields.io/pypi/dm/presidio-analyzer.svg)" # noqa
38
+ "[![MIT license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT)"
39
  "![GitHub Repo stars](https://img.shields.io/github/stars/microsoft/presidio?style=social)"
40
  )
41
 
 
70
  - Encrypt: Replaces with an AES encryption of the PII string, allowing the process to be reversed
71
  """,
72
  )
73
+ st_mask_char = "*"
74
+ st_number_of_chars = 15
75
+ st_encrypt_key = "WmZq4t7w!z%C&F)J"
76
+ st_openai_key = ""
77
+ st_openai_model = "text-davinci-003"
78
  if st_operator == "mask":
79
  st_number_of_chars = st.sidebar.number_input(
80
+ "number of chars", value=st_number_of_chars, min_value=0, max_value=100
81
+ )
82
+ st_mask_char = st.sidebar.text_input(
83
+ "Mask character", value=st_mask_char, max_chars=1
84
  )
 
85
  elif st_operator == "encrypt":
86
+ st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
87
  elif st_operator == "synthesize":
88
  st_openai_key = st.sidebar.text_input(
89
  "OPENAI_KEY",
 
93
  )
94
  st_openai_model = st.sidebar.text_input(
95
  "OpenAI model for text synthesis",
96
+ value=st_openai_model,
97
  help="See more here: https://platform.openai.com/docs/models/",
98
  )
99
  st_threshold = st.sidebar.slider(
 
105
  )
106
 
107
  st_return_decision_process = st.sidebar.checkbox(
108
+ "Add analysis explanations to findings",
109
+ value=False,
110
+ help="Add the decision process to the output table. "
111
+ "More information can be found here: https://microsoft.github.io/presidio/analyzer/decision_process/",
112
  )
113
 
114
  st_entities = st.sidebar.multiselect(
115
  label="Which entities to look for?",
116
+ options=get_supported_entities(st_model),
117
+ default=list(get_supported_entities(st_model)),
118
+ help="Limit the list of PII entities detected. "
119
+ "This list is dynamic and based on the NER model and registered recognizers. "
120
+ "More information can be found here: https://microsoft.github.io/presidio/analyzer/adding_recognizers/",
121
  )
122
 
123
  # Main panel
 
141
  )
142
 
143
  st_analyze_results = analyze(
144
+ st_model=st_model,
145
  text=st_text,
146
  entities=st_entities,
147
  language="en",
 
153
  if st_operator not in ("highlight", "synthesize"):
154
  with col2:
155
  st.subheader(f"Output")
156
+ st_anonymize_results = anonymize(
157
+ text=st_text,
158
+ operator=st_operator,
159
+ mask_char=st_mask_char,
160
+ number_of_chars=st_number_of_chars,
161
+ encrypt_key=st_encrypt_key,
162
+ analyze_results=st_analyze_results,
163
+ )
164
  st.text_area(label="De-identified", value=st_anonymize_results.text, height=400)
165
  elif st_operator == "synthesize":
166
  with col2:
 
174
  st.text_area(label="Synthetic data", value=fake_data, height=400)
175
  else:
176
  st.subheader("Highlighted")
177
+ annotated_tokens = annotate(
178
+ text=st_text,
179
+ analyze_results=st_analyze_results
180
+ )
181
  # annotated_tokens
182
  annotated_text(*annotated_tokens)
183
 
 
197
  )
198
  if st_analyze_results:
199
  df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
200
+ df["text"] = [st_text[res.start: res.end] for res in st_analyze_results]
201
 
202
  df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
203
  {
 
209
  },
210
  axis=1,
211
  )
212
+ df_subset["Text"] = [st_text[res.start: res.end] for res in st_analyze_results]
213
  if st_return_decision_process:
214
  analysis_explanation_df = pd.DataFrame.from_records(
215
  [r.analysis_explanation.to_dict() for r in st_analyze_results]