"""Streamlit app for Presidio.""" import logging import os import traceback import dotenv import pandas as pd import streamlit as st import streamlit.components.v1 as components from annotated_text import annotated_text from streamlit_tags import st_tags from openai_fake_data_generator import OpenAIParams from presidio_helpers import ( get_supported_entities, analyze, anonymize, annotate, create_fake_data, analyzer_engine, ) st.set_page_config( page_title="Presidio Testground (do not use)", layout="wide", initial_sidebar_state="expanded", menu_items={ "About": "https://microsoft.github.io/presidio/", }, ) dotenv.load_dotenv() logger = logging.getLogger("presidio-streamlit") allow_other_models = os.getenv("ALLOW_OTHER_MODELS", False) # Sidebar st.sidebar.header( """ De-Identification Testground with [Microsoft Presidio](https://microsoft.github.io/presidio/) """ ) model_help_text = """ Select which Named Entity Recognition (NER) model to use for PII detection, in parallel to rule-based recognizers. Presidio supports multiple NER packages off-the-shelf, such as spaCy, Huggingface, Stanza and Flair, as well as service such as Azure Text Analytics PII. """ st_ta_key = st_ta_endpoint = "" model_list = [ "spaCy/en_core_web_lg", "spaCy/de_core_news_lg", "flair/ner-english-large", "flair/ner-german", "HuggingFace/obi/deid_roberta_i2b2", "HuggingFace/StanfordAIMI/stanford-deidentifier-base", "stanza/en", "Azure AI Language", "Other", ] if not allow_other_models: model_list.pop() # Select model st_model = st.sidebar.selectbox( "NER model package", model_list, index=2, help=model_help_text, ) # Extract model package. st_model_package = st_model.split("/")[0] # Remove package prefix (if needed) st_model = ( st_model if st_model_package.lower() not in ("spacy", "stanza", "huggingface") else "/".join(st_model.split("/")[1:]) ) if st_model == "Other": st_model_package = st.sidebar.selectbox( "NER model OSS package", options=["spaCy", "stanza", "Flair", "HuggingFace"] ) st_model = st.sidebar.text_input(f"NER model name", value="") if st_model == "Azure AI Language": st_ta_key = st.sidebar.text_input( f"Azure AI Language key", value=os.getenv("TA_KEY", ""), type="password" ) st_ta_endpoint = st.sidebar.text_input( f"Azure AI Language endpoint", value=os.getenv("TA_ENDPOINT", default=""), help="For more info: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/overview", # noqa: E501 ) st.sidebar.warning("Note: Models might take some time to download. ") analyzer_params = (st_model_package, st_model, st_ta_key, st_ta_endpoint) logger.debug(f"analyzer_params: {analyzer_params}") st_operator = st.sidebar.selectbox( "De-identification approach", ["redact", "replace", "synthesize", "highlight", "mask", "hash", "encrypt"], index=1, help=""" Select which manipulation to the text is requested after PII has been identified.\n - Redact: Completely remove the PII text\n - Replace: Replace the PII text with a constant, e.g. \n - Synthesize: Replace with fake values (requires an OpenAI key)\n - Highlight: Shows the original text with PII highlighted in colors\n - Mask: Replaces a requested number of characters with an asterisk (or other mask character)\n - Hash: Replaces with the hash of the PII string\n - Encrypt: Replaces with an AES encryption of the PII string, allowing the process to be reversed """, ) st_mask_char = "*" st_number_of_chars = 15 st_encrypt_key = "WmZq4t7w!z%C&F)J" open_ai_params = None logger.debug(f"st_operator: {st_operator}") def set_up_openai_synthesis(): """Set up the OpenAI API key and model for text synthesis.""" if os.getenv("OPENAI_TYPE", default="openai") == "Azure": openai_api_type = "azure" st_openai_api_base = st.sidebar.text_input( "Azure OpenAI base URL", value=os.getenv("AZURE_OPENAI_ENDPOINT", default=""), ) openai_key = os.getenv("AZURE_OPENAI_KEY", default="") st_deployment_id = st.sidebar.text_input( "Deployment name", value=os.getenv("AZURE_OPENAI_DEPLOYMENT", default="") ) st_openai_version = st.sidebar.text_input( "OpenAI version", value=os.getenv("OPENAI_API_VERSION", default="2023-05-15"), ) else: openai_api_type = "openai" st_openai_version = st_openai_api_base = None st_deployment_id = "" openai_key = os.getenv("OPENAI_KEY", default="") st_openai_key = st.sidebar.text_input( "OPENAI_KEY", value=openai_key, help="See https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key for more info.", type="password", ) st_openai_model = st.sidebar.text_input( "OpenAI model for text synthesis", value=os.getenv("OPENAI_MODEL", default="gpt-3.5-turbo-instruct"), help="See more here: https://platform.openai.com/docs/models/", ) return ( openai_api_type, st_openai_api_base, st_deployment_id, st_openai_version, st_openai_key, st_openai_model, ) if st_operator == "mask": st_number_of_chars = st.sidebar.number_input( "number of chars", value=st_number_of_chars, min_value=0, max_value=100 ) st_mask_char = st.sidebar.text_input( "Mask character", value=st_mask_char, max_chars=1 ) elif st_operator == "encrypt": st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key) elif st_operator == "synthesize": ( openai_api_type, st_openai_api_base, st_deployment_id, st_openai_version, st_openai_key, st_openai_model, ) = set_up_openai_synthesis() open_ai_params = OpenAIParams( openai_key=st_openai_key, model=st_openai_model, api_base=st_openai_api_base, deployment_id=st_deployment_id, api_version=st_openai_version, api_type=openai_api_type, ) st_threshold = st.sidebar.slider( label="Acceptance threshold", min_value=0.0, max_value=1.0, value=0.35, help="Define the threshold for accepting a detection as PII. See more here: ", ) st_return_decision_process = st.sidebar.checkbox( "Add analysis explanations to findings", value=False, help="Add the decision process to the output table. " "More information can be found here: https://microsoft.github.io/presidio/analyzer/decision_process/", ) # Allow and deny lists st_deny_allow_expander = st.sidebar.expander( "Allowlists and denylists", expanded=False, ) with st_deny_allow_expander: st_allow_list = st_tags( label="Add words to the allowlist", text="Enter word and press enter." ) st.caption( "Allowlists contain words that are not considered PII, but are detected as such." ) st_deny_list = st_tags( label="Add words to the denylist", text="Enter word and press enter." ) st.caption( "Denylists contain words that are considered PII, but are not detected as such." ) # Main panel with st.expander("About this demo", expanded=False): st.info( """Presidio is an open source customizable framework for PII detection and de-identification. \n\n[Code](https://aka.ms/presidio) | [Tutorial](https://microsoft.github.io/presidio/tutorial/) | [Installation](https://microsoft.github.io/presidio/installation/) | [FAQ](https://microsoft.github.io/presidio/faq/) | [Feedback](https://forms.office.com/r/9ufyYjfDaY) |""" ) st.info( """ Use this demo to: - Experiment with different off-the-shelf models and NLP packages. - Explore the different de-identification options, including redaction, masking, encryption and more. - Generate synthetic text with Microsoft Presidio and OpenAI. - Configure allow and deny lists. This demo website shows some of Presidio's capabilities. [Visit our website](https://microsoft.github.io/presidio) for more info, samples and deployment options. """ ) st.markdown( "[![Pypi Downloads](https://img.shields.io/pypi/dm/presidio-analyzer.svg)](https://img.shields.io/pypi/dm/presidio-analyzer.svg)" # noqa "[![MIT license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT)" "![GitHub Repo stars](https://img.shields.io/github/stars/microsoft/presidio?style=social)" ) analyzer_load_state = st.info("Starting Presidio analyzer...") analyzer_load_state.empty() # Read default text with open("demo_text.txt") as f: demo_text = f.readlines() # Create two columns for before and after col1, col2 = st.columns(2) # Before: col1.subheader("Input") st_text = col1.text_area( label="Enter text", value="".join(demo_text), height=400, key="text_input" ) try: # Choose entities st_entities_expander = st.sidebar.expander("Choose entities to look for") st_entities = st_entities_expander.multiselect( label="Which entities to look for?", options=get_supported_entities(*analyzer_params), default=list(get_supported_entities(*analyzer_params)), help="Limit the list of PII entities detected. " "This list is dynamic and based on the NER model and registered recognizers. " "More information can be found here: https://microsoft.github.io/presidio/analyzer/adding_recognizers/", ) # Before analyzer_load_state = st.info("Starting Presidio analyzer...") analyzer = analyzer_engine(*analyzer_params) analyzer_load_state.empty() st_analyze_results = analyze( *analyzer_params, text=st_text, entities=st_entities, # entities = ['ACCOUNTNUM', 'BUILDINGNUM', 'CITY', 'CREDITCARDNUMBER', 'DATEOFBIRTH', 'DRIVERLICENSENUM', 'EMAIL', 'GIVENNAME', 'IDCARDNUM', 'PASSWORD', 'SOCIALNUM', 'STREET', 'SURNAME', 'TAXNUM', 'TELEPHONENUM', 'USERNAME', 'ZIPCODE'], language="en", score_threshold=st_threshold, return_decision_process=st_return_decision_process, allow_list=st_allow_list, deny_list=st_deny_list, ) # After if st_operator not in ("highlight", "synthesize"): with col2: st.subheader(f"Output") st_anonymize_results = anonymize( text=st_text, operator=st_operator, mask_char=st_mask_char, number_of_chars=st_number_of_chars, encrypt_key=st_encrypt_key, analyze_results=st_analyze_results, ) st.text_area( label="De-identified", value=st_anonymize_results.text, height=400 ) elif st_operator == "synthesize": with col2: st.subheader(f"OpenAI Generated output") fake_data = create_fake_data( st_text, st_analyze_results, open_ai_params, ) st.text_area(label="Synthetic data", value=fake_data, height=400) else: st.subheader("Highlighted") annotated_tokens = annotate(text=st_text, analyze_results=st_analyze_results) # annotated_tokens annotated_text(*annotated_tokens) # table result st.subheader( "Findings" if not st_return_decision_process else "Findings with decision factors" ) if st_analyze_results: df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results]) df["text"] = [st_text[res.start : res.end] for res in st_analyze_results] df_subset = df[["entity_type", "text", "start", "end", "score"]].rename( { "entity_type": "Entity type", "text": "Text", "start": "Start", "end": "End", "score": "Confidence", }, axis=1, ) df_subset["Text"] = [st_text[res.start : res.end] for res in st_analyze_results] if st_return_decision_process: analysis_explanation_df = pd.DataFrame.from_records( [r.analysis_explanation.to_dict() for r in st_analyze_results] ) df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1) st.dataframe(df_subset.reset_index(drop=True), use_container_width=True) else: st.text("No findings") except Exception as e: print(e) traceback.print_exc() st.error(e) components.html( """ """ )