Spaces:

AliaeAI
/

Anonymizer_demo

Sleeping

File size: 11,264 Bytes

b494f67

"""Streamlit app for Presidio."""
import logging
import os
import traceback

import dotenv
import pandas as pd
import streamlit as st
import streamlit.components.v1 as components
from annotated_text import annotated_text
from streamlit_tags import st_tags

# from openai_fake_data_generator import OpenAIParams
from presidio_helpers import (
    get_supported_entities,
    analyze,
    anonymize,
    annotate,
    # create_fake_data,
    analyzer_engine,
)

st.set_page_config(
    page_title="Presidio demo",
    layout="wide",
    initial_sidebar_state="expanded",
    # menu_items={
    #     "About": "https://microsoft.github.io/presidio/",
    # },
)

dotenv.load_dotenv()
logger = logging.getLogger("presidio-streamlit")


allow_other_models = os.getenv("ALLOW_OTHER_MODELS", False)


# Sidebar
st.sidebar.header(
    """
Personal Info Anonymization
"""
)

# set aliae logo
st.sidebar.image('logo.png', use_column_width=True)


model_help_text = """
    Select which Named Entity Recognition (NER) model to use for PII detection, in parallel to rule-based recognizers.
    Presidio supports multiple NER packages off-the-shelf, such as spaCy, Huggingface, Stanza and Flair,
    as well as service such as Azure Text Analytics PII.
    """
st_ta_key = st_ta_endpoint = ""

model_list = [
    "spaCy/en_core_web_lg",
    "spaCy/fr_core_news_md",
]
# "flair/ner-english-large",
#
# "HuggingFace/StanfordAIMI/stanford-deidentifier-base",
# "Azure Text Analytics PII",
# "Other",


# if not allow_other_models:
#     model_list.pop()


# Select model
lang = st.sidebar.selectbox(
    "Language",
    ['en','fr'],
    index=0,
)

# Extract model package.
# st_model_package = st_model.split("/")[0]
st_model_package = 'spaCy'

# # Remove package prefix (if needed)
# st_model = (
#     st_model
#     if st_model_package not in ("spaCy", "HuggingFace")
#     else "/".join(st_model.split("/")[1:])
# )
st_model = 'en_core_web_lg'
if lang =='en': st_model = 'en_core_web_lg'
elif lang == 'fr' : st_model = 'fr_core_news_md'

# if st_model == "Other":
#     st_model_package = st.sidebar.selectbox(
#         "NER model OSS package", options=["spaCy", "Flair", "HuggingFace"]
#     )
#     st_model = st.sidebar.text_input(f"NER model name", value="")

# if st_model == "Azure Text Analytics PII":
#     st_ta_key = st.sidebar.text_input(
#         f"Text Analytics key", value=os.getenv("TA_KEY", ""), type="password"
#     )
#     st_ta_endpoint = st.sidebar.text_input(
#         f"Text Analytics endpoint",
#         value=os.getenv("TA_ENDPOINT", default=""),
#         help="For more info: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/overview",  # noqa: E501
#     )


# st.sidebar.warning("Note: Models might take some time to download. ")

analyzer_params = (st_model_package, st_model, st_ta_key, st_ta_endpoint)
logger.debug(f"analyzer_params: {analyzer_params}")

st_operator = st.sidebar.selectbox(
    "De-identification approach",
    ["redact", "replace", "highlight"],
    index=2,
    help="""
    Select which manipulation to the text is requested after PII has been identified.\n
    - Redact: Completely remove the PII text\n
    - Replace: Replace the PII text with a constant, e.g. <PERSON>\n
    - Highlight: Shows the original text with PII highlighted in colors\n
         """,
)
st_mask_char = "*"
st_number_of_chars = 15
st_encrypt_key = "WmZq4t7w!z%C&F)J"

open_ai_params = None

logger.debug(f"st_operator: {st_operator}")

# if st_operator == "mask":
#     st_number_of_chars = st.sidebar.number_input(
#         "number of chars", value=st_number_of_chars, min_value=0, max_value=100
#     )
#     st_mask_char = st.sidebar.text_input(
#         "Mask character", value=st_mask_char, max_chars=1
#     )
# elif st_operator == "encrypt":
#     st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
# elif st_operator == "synthesize":
#     if os.getenv("OPENAI_TYPE", default="openai") == "Azure":
#         openai_api_type = "azure"
#         st_openai_api_base = st.sidebar.text_input(
#             "Azure OpenAI base URL",
#             value=os.getenv("AZURE_OPENAI_ENDPOINT", default=""),
#         )
#         st_deployment_name = st.sidebar.text_input(
#             "Deployment name", value=os.getenv("AZURE_OPENAI_DEPLOYMENT", default="")
#         )
#         st_openai_version = st.sidebar.text_input(
#             "OpenAI version",
#             value=os.getenv("OPENAI_API_VERSION", default="2023-05-15"),
#         )
#     else:
#         st_openai_version = openai_api_type = st_openai_api_base = None
#         st_deployment_name = ""
#     st_openai_key = st.sidebar.text_input(
#         "OPENAI_KEY",
#         value=os.getenv("OPENAI_KEY", default=""),
#         help="See https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key for more info.",
#         type="password",
#     )
#     st_openai_model = st.sidebar.text_input(
#         "OpenAI model for text synthesis",
#         value=os.getenv("OPENAI_MODEL", default="text-davinci-003"),
#         help="See more here: https://platform.openai.com/docs/models/",
#     )
#
#     open_ai_params = OpenAIParams(
#         openai_key=st_openai_key,
#         model=st_openai_model,
#         api_base=st_openai_api_base,
#         deployment_name=st_deployment_name,
#         api_version=st_openai_version,
#         api_type=openai_api_type,
#     )

# st_threshold = st.sidebar.slider(
#     label="Acceptance threshold",
#     min_value=0.0,
#     max_value=1.0,
#     value=0.35,
#     help="Define the threshold for accepting a detection as PII. See more here: ",
# )
st_threshold = 0.35
#
# st_return_decision_process = st.sidebar.checkbox(
#     "Add analysis explanations to findings",
#     value=False,
#     help="Add the decision process to the output table. "
#     "More information can be found here: https://microsoft.github.io/presidio/analyzer/decision_process/",
# )
st_return_decision_process = False

# # Allow and deny lists
# st_deny_allow_expander = st.sidebar.expander(
#     "Allowlists and denylists",
#     expanded=False,
# )
#
# with st_deny_allow_expander:
#     st_allow_list = st_tags(
#         label="Add words to the allowlist", text="Enter word and press enter."
#     )
#     st.caption(
#         "Allowlists contain words that are not considered PII, but are detected as such."
#     )
#
#     st_deny_list = st_tags(
#         label="Add words to the denylist", text="Enter word and press enter."
#     )
#     st.caption(
#         "Denylists contain words that are considered PII, but are not detected as such."
#     )
st_allow_list = []
st_deny_list = []
# Main panel

with st.expander("About Microsoft Presidio", expanded=False):
    st.info(
        """Presidio is an open source customizable framework for PII detection and de-identification."""
    )

analyzer_load_state = st.info("Starting Presidio analyzer...")

analyzer_load_state.empty()

# Read default text
with open("en_demo_text.txt") as f:
    en_demo_text = f.readlines()
with open("fr_demo_text.txt") as f:
    fr_demo_text = f.readlines()

if lang == 'en': demo_text = en_demo_text
elif lang == 'fr': demo_text = fr_demo_text

# Create two columns for before and after
col1, col2 = st.columns(2)

# Before:
col1.subheader("Input")
st_text = col1.text_area(
    label="Enter text", value="".join(demo_text), height=400, key="text_input"
)

try:
    # Choose entities
    st_entities_expander = st.sidebar.expander("Choose entities to look for")
    st_entities = st_entities_expander.multiselect(
        label="Which entities to look for?",
        options=get_supported_entities(*analyzer_params),
        default=list(get_supported_entities(*analyzer_params)),
        help="Limit the list of PII entities detected. "
        "This list is dynamic and based on the NER model and registered recognizers. "
        "More information can be found here: https://microsoft.github.io/presidio/analyzer/adding_recognizers/",
    )

    # Before
    analyzer_load_state = st.info("Starting Presidio analyzer...")
    analyzer = analyzer_engine(*analyzer_params)
    analyzer_load_state.empty()

    st_analyze_results = analyze(
        *analyzer_params,
        text=st_text,
        entities=st_entities,
        language=lang,
        score_threshold=st_threshold,
        return_decision_process=st_return_decision_process,
        allow_list=st_allow_list,
        deny_list=st_deny_list,
    )

    # After
    if st_operator not in ("highlight", "synthesize"):
        with col2:
            st.subheader(f"Output")
            st_anonymize_results = anonymize(
                text=st_text,
                operator=st_operator,
                mask_char=st_mask_char,
                number_of_chars=st_number_of_chars,
                encrypt_key=st_encrypt_key,
                analyze_results=st_analyze_results,
            )
            st.text_area(
                label="De-identified", value=st_anonymize_results.text, height=400
            )
    # elif st_operator == "synthesize":
    #     with col2:
    #         st.subheader(f"OpenAI Generated output")
    #         fake_data = create_fake_data(
    #             st_text,
    #             st_analyze_results,
    #             open_ai_params,
    #         )
    #         st.text_area(label="Synthetic data", value=fake_data, height=400)
    else:
        st.subheader("Highlighted")
        annotated_tokens = annotate(text=st_text, analyze_results=st_analyze_results)
        # annotated_tokens
        annotated_text(*annotated_tokens)

    # table result
    st.subheader(
        "Findings"
        if not st_return_decision_process
        else "Findings with decision factors"
    )
    if st_analyze_results:
        df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
        df["text"] = [st_text[res.start : res.end] for res in st_analyze_results]

        df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
            {
                "entity_type": "Entity type",
                "text": "Text",
                "start": "Start",
                "end": "End",
                "score": "Confidence",
            },
            axis=1,
        )
        df_subset["Text"] = [st_text[res.start : res.end] for res in st_analyze_results]
        if st_return_decision_process:
            analysis_explanation_df = pd.DataFrame.from_records(
                [r.analysis_explanation.to_dict() for r in st_analyze_results]
            )
            df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)
        st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
    else:
        st.text("No findings")

except Exception as e:
    print(e)
    traceback.print_exc()
    st.error(e)

components.html(
    """
    <script type="text/javascript">
    (function(c,l,a,r,i,t,y){
        c[a]=c[a]||function(){(c[a].q=c[a].q||[]).push(arguments)};
        t=l.createElement(r);t.async=1;t.src="https://www.clarity.ms/tag/"+i;
        y=l.getElementsByTagName(r)[0];y.parentNode.insertBefore(t,y);
    })(window, document, "clarity", "script", "h7f8bp42n8");
    </script>
    """
)