|
"""Streamlit app for Presidio.""" |
|
import os |
|
|
|
import pandas as pd |
|
import streamlit as st |
|
import streamlit.components.v1 as components |
|
|
|
from annotated_text import annotated_text |
|
from streamlit_tags import st_tags |
|
|
|
from presidio_helpers import ( |
|
get_supported_entities, |
|
analyze, |
|
anonymize, |
|
annotate, |
|
create_fake_data, |
|
analyzer_engine, |
|
nlp_engine_and_registry, |
|
) |
|
|
|
# Configure the page before any other element is rendered; the demo uses
# the wide layout.
st.set_page_config(layout="wide", page_title="Presidio demo")

# Sidebar title.
st.sidebar.header(
    """
PII De-Identification with Microsoft Presidio
"""
)

# Short project description followed by a row of " | "-separated links.
_about_links = [
    "[Code](https://aka.ms/presidio)",
    "[Tutorial](https://microsoft.github.io/presidio/tutorial/)",
    "[Installation](https://microsoft.github.io/presidio/installation/)",
    "[FAQ](https://microsoft.github.io/presidio/faq/)",
]
_about_text = (
    "Presidio is an open source customizable framework for PII detection and de-identification\n"
    + " | ".join(_about_links)
)
st.sidebar.info(_about_text, icon="ℹ️")

# Badges (downloads, license, GitHub stars) rendered back to back as markdown.
_badges = (
    "[![Pypi Downloads](https://img.shields.io/pypi/dm/presidio-analyzer.svg)](https://img.shields.io/pypi/dm/presidio-analyzer.svg)",
    "[![MIT license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT)",
    "![GitHub Repo stars](https://img.shields.io/github/stars/microsoft/presidio?style=social)",
)
st.sidebar.markdown("".join(_badges))
|
|
|
# Tooltip shared by every "NER model for PII detection" select box.
model_help_text = """
Select which Named Entity Recognition (NER) model to use for PII detection, in parallel to rule-based recognizers.
Presidio supports multiple NER packages off-the-shelf, such as spaCy, Huggingface, Stanza and Flair,
as well as service such as Azure Text Analytics PII.
"""

# Defaults, overwritten below depending on the sidebar selections.
st_ta_key = ""
st_ta_endpoint = ""
st_model = "en_core_web_lg"

# Models offered for each NER package. The key order is also the order in
# which the packages are listed in the package select box.
_models_by_package = {
    "spaCy": ["en_core_web_lg", "en_core_web_trf", "Other"],
    "flair": ["flair/ner-english-large", "Other"],
    "HuggingFace": [
        "obi/deid_roberta_i2b2",
        "StanfordAIMI/stanford-deidentifier-base",
        "Other",
    ],
    "Azure Text Analytics": ["Azure Text Analytics PII"],
}

st_model_package = st.sidebar.selectbox(
    "NER model package",
    list(_models_by_package),
    index=2,  # pre-selects "HuggingFace"
    help="Select the NLP package to use for PII detection",
)

_model_choices = _models_by_package.get(st_model_package)
if _model_choices is not None:
    st_model = st.sidebar.selectbox(
        "NER model for PII detection",
        _model_choices,
        help=model_help_text,
    )

# The Azure service additionally needs credentials from the user.
if st_model_package == "Azure Text Analytics":
    st_ta_key = st.sidebar.text_input("Text Analytics Key", type="password")
    st_ta_endpoint = st.sidebar.text_input("Text Analytics Endpoint")

# "Other" lets the user type an arbitrary model name for the chosen package.
if st_model == "Other":
    st_model = st.sidebar.text_input(
        f"NER model name for package {st_model_package}", value=""
    )

st.sidebar.warning("Note: Models might take some time to download. ")

# Positional arguments shared by all presidio_helpers calls below.
analyzer_params = (st_model_package, st_model, st_ta_key, st_ta_endpoint)
|
|
|
# Tooltip describing each de-identification operator.
_operator_help = """
Select which manipulation to the text is requested after PII has been identified.\n
- Redact: Completely remove the PII text\n
- Replace: Replace the PII text with a constant, e.g. <PERSON>\n
- Synthesize: Replace with fake values (requires an OpenAI key)\n
- Highlight: Shows the original text with PII highlighted in colors\n
- Mask: Replaces a requested number of characters with an asterisk (or other mask character)\n
- Hash: Replaces with the hash of the PII string\n
- Encrypt: Replaces with an AES encryption of the PII string, allowing the process to be reversed
"""

_operators = ["redact", "replace", "synthesize", "highlight", "mask", "hash", "encrypt"]
st_operator = st.sidebar.selectbox(
    "De-identification approach",
    _operators,
    index=1,  # pre-selects "replace"
    help=_operator_help,
)

# Operator-specific defaults; only the ones relevant to the chosen operator
# are replaced by sidebar widgets below.
st_mask_char = "*"
st_number_of_chars = 15
# NOTE(review): hard-coded default AES key, shown pre-filled in the sidebar
# when "encrypt" is selected — confirm it is only meant for demo purposes.
st_encrypt_key = "WmZq4t7w!z%C&F)J"
st_openai_key = ""
st_openai_model = "text-davinci-003"

if st_operator == "mask":
    st_number_of_chars = st.sidebar.number_input(
        "number of chars",
        min_value=0,
        max_value=100,
        value=st_number_of_chars,
    )
    st_mask_char = st.sidebar.text_input(
        "Mask character",
        max_chars=1,
        value=st_mask_char,
    )
elif st_operator == "encrypt":
    st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
elif st_operator == "synthesize":
    # The key field is pre-filled from the OPENAI_KEY environment variable.
    st_openai_key = st.sidebar.text_input(
        "OPENAI_KEY",
        type="password",
        value=os.getenv("OPENAI_KEY", default=""),
        help="See https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key for more info.",
    )
    st_openai_model = st.sidebar.text_input(
        "OpenAI model for text synthesis",
        value=st_openai_model,
        help="See more here: https://platform.openai.com/docs/models/",
    )

# NOTE(review): the help text below ends with "See more here: " but no link
# follows it.
st_threshold = st.sidebar.slider(
    label="Acceptance threshold",
    value=0.35,
    min_value=0.0,
    max_value=1.0,
    help="Define the threshold for accepting a detection as PII. See more here: ",
)

st_return_decision_process = st.sidebar.checkbox(
    "Add analysis explanations to findings",
    value=False,
    help="Add the decision process to the output table. "
    "More information can be found here: https://microsoft.github.io/presidio/analyzer/decision_process/",
)
|
|
|
|
|
# Collapsed sidebar section holding the allow-list and deny-list tag inputs.
st_deny_allow_expander = st.sidebar.expander(label="Allow and deny lists", expanded=False)

with st_deny_allow_expander:
    st_allow_list = st_tags(
        label="Add words to the allow list",
        text="Enter word and press enter.",
    )
    st.caption(
        "Allow lists contain words that are not considered PII, but are detected as such."
    )

    st_deny_list = st_tags(
        label="Add words to the deny list",
        text="Enter word and press enter.",
    )
    st.caption(
        "Deny lists contain words that are considered PII, but are not detected as such."
    )
|
|
|
# Show a status message while the engines are created, then clear it.
analyzer_load_state = st.info("Starting Presidio analyzer...")
# nlp_engine and registry are not referenced again in this section; the call
# is kept so that whatever it sets up still runs.
nlp_engine, registry = nlp_engine_and_registry(*analyzer_params)
analyzer = analyzer_engine(*analyzer_params)
analyzer_load_state.empty()

# Look up the supported entities once and reuse the result for both the
# available options and the pre-selected defaults (all entities selected).
supported_entities = get_supported_entities(*analyzer_params)

st_entities_expander = st.sidebar.expander("Choose entities to look for")
st_entities = st_entities_expander.multiselect(
    label="Which entities to look for?",
    options=supported_entities,
    default=list(supported_entities),
    help="Limit the list of PII entities detected. "
    "This list is dynamic and based on the NER model and registered recognizers. "
    "More information can be found here: https://microsoft.github.io/presidio/analyzer/adding_recognizers/",
)
|
|
|
|
|
# Load the sample text used to pre-fill the input box. The encoding is given
# explicitly so the result does not depend on the platform's locale default.
with open("demo_text.txt", encoding="utf-8") as f:
    demo_text = f.readlines()

# Two-column layout: input text on the left, output on the right.
col1, col2 = st.columns(2)

col1.subheader("Input string:")
st_text = col1.text_area(
    label="Enter text",
    value="".join(demo_text),
    height=400,
)
|
|
|
|
|
# Run PII detection on the current input with the sidebar settings.
st_analyze_results = analyze(
    *analyzer_params,
    text=st_text,
    entities=st_entities,
    language="en",
    score_threshold=st_threshold,
    return_decision_process=st_return_decision_process,
    allow_list=st_allow_list,
    deny_list=st_deny_list,
)

# Render the output according to the selected de-identification approach.
if st_operator == "highlight":
    # NOTE(review): this branch opens no column context, so it is written
    # to the main page flow rather than to col2 — confirm that is intended.
    st.subheader("Highlighted")
    annotated_tokens = annotate(text=st_text, analyze_results=st_analyze_results)
    annotated_text(*annotated_tokens)
elif st_operator == "synthesize":
    with col2:
        st.subheader("OpenAI Generated output")
        fake_data = create_fake_data(
            st_text,
            st_analyze_results,
            openai_key=st_openai_key,
            openai_model_name=st_openai_model,
        )
        st.text_area(label="Synthetic data", value=fake_data, height=400)
else:
    # Every remaining operator goes through the anonymizer.
    with col2:
        st.subheader("Output")
        st_anonymize_results = anonymize(
            text=st_text,
            operator=st_operator,
            mask_char=st_mask_char,
            number_of_chars=st_number_of_chars,
            encrypt_key=st_encrypt_key,
            analyze_results=st_analyze_results,
        )
        st.text_area(
            label="De-identified",
            value=st_anonymize_results.text,
            height=400,
        )
|
|
|
|
|
|
|
# Findings table: one row per analyzer result, optionally extended with the
# decision-process columns.
st.subheader(
    "Findings with decision factors" if st_return_decision_process else "Findings"
)

if st_analyze_results:
    df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
    # Slice the matched substring out of the input text for each result.
    df["text"] = [st_text[res.start : res.end] for res in st_analyze_results]

    # Keep only the displayed columns and give them readable headers. The
    # "Text" column comes from df["text"] above, so it is not recomputed.
    df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
        columns={
            "entity_type": "Entity type",
            "text": "Text",
            "start": "Start",
            "end": "End",
            "score": "Confidence",
        }
    )

    if st_return_decision_process:
        # Append the per-result explanation fields as extra columns.
        analysis_explanation_df = pd.DataFrame.from_records(
            [r.analysis_explanation.to_dict() for r in st_analyze_results]
        )
        df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)

    st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
else:
    st.text("No findings")
|
|
|
# HTML snippet that injects the script served from www.clarity.ms for the
# tag id given in the last argument.
_clarity_snippet = """
<script type="text/javascript">
(function(c,l,a,r,i,t,y){
c[a]=c[a]||function(){(c[a].q=c[a].q||[]).push(arguments)};
t=l.createElement(r);t.async=1;t.src="https://www.clarity.ms/tag/"+i;
y=l.getElementsByTagName(r)[0];y.parentNode.insertBefore(t,y);
})(window, document, "clarity", "script", "h7f8bp42n8");
</script>
"""

components.html(_clarity_snippet)
|
|