Spaces:

omri374
/

presidio

Build error

App Files Files Community

omri374 committed on May 29, 2023

Commit

7172378

•

1 Parent(s): f5f7ba5

Upload 10 files

Browse files

Files changed (6) hide show

openai_fake_data_generator.py +33 -8
presidio_helpers.py +41 -11
presidio_nlp_engine_config.py +3 -1
presidio_streamlit.py +198 -117
requirements.txt +1 -0
text_analytics_wrapper.py +3 -1

openai_fake_data_generator.py CHANGED Viewed

@@ -1,25 +1,50 @@
 import openai
-def set_openai_key(openai_key: str):
     """Set the OpenAI API key.
-    :param openai_key: the open AI key (https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key)
     """
-    openai.api_key = openai_key
 def call_completion_model(
-    prompt: str, model: str = "text-davinci-003", max_tokens: int = 512
 ) -> str:
     """Creates a request for the OpenAI Completion service and returns the response.
     :param prompt: The prompt for the completion model
     :param model: OpenAI model name
     :param max_tokens: Model's max_tokens parameter
     """
-    response = openai.Completion.create(
-        model=model, prompt=prompt, max_tokens=max_tokens
-    )
     return response["choices"][0].text

+from collections import namedtuple
+from typing import Optional
 import openai
+import logging
+logger = logging.getLogger("presidio-streamlit")
+OpenAIParams = namedtuple(
+    "open_ai_params",
+    ["openai_key", "model", "api_base", "deployment_name", "api_version", "api_type"],
+)
+def set_openai_params(openai_params: OpenAIParams):
     """Set the OpenAI API key.
+    Also sets the API version and, when an API base is provided, the API base and API type.
+    :param openai_params: OpenAIParams tuple with the fields openai_key, model, api_base,
+    deployment_name, api_version and api_type. Only openai_key, api_version, api_base and
+    api_type are read here. The Azure-specific fields only relate to Azure OpenAI deployments.
     """
+    openai.api_key = openai_params.openai_key
+    openai.api_version = openai_params.api_version
+    # api_base and api_type are applied together, and only when api_base is non-empty.
+    if openai_params.api_base:
+        openai.api_base = openai_params.api_base
+        openai.api_type = openai_params.api_type
 def call_completion_model(
+    prompt: str,
+    model: str = "text-davinci-003",
+    max_tokens: int = 512,
+    deployment_id: Optional[str] = None,
 ) -> str:
     """Creates a request for the OpenAI Completion service and returns the response.
     :param prompt: The prompt for the completion model
     :param model: OpenAI model name
     :param max_tokens: Model's max_tokens parameter
+    :param deployment_id: Azure OpenAI deployment ID
     """
+    # Build the request arguments once; deployment_id is only sent when one was provided.
+    request_args = {"model": model, "prompt": prompt, "max_tokens": max_tokens}
+    if deployment_id:
+        request_args["deployment_id"] = deployment_id
+    response = openai.Completion.create(**request_args)
     return response["choices"][0].text

presidio_helpers.py CHANGED Viewed

@@ -2,22 +2,24 @@
 Helper methods for the Presidio Streamlit app
 """
 from typing import List, Optional, Tuple
 import streamlit as st
 from presidio_analyzer import (
     AnalyzerEngine,
     RecognizerResult,
     RecognizerRegistry,
     PatternRecognizer,
 )
 from presidio_analyzer.nlp_engine import NlpEngine
 from presidio_anonymizer import AnonymizerEngine
 from presidio_anonymizer.entities import OperatorConfig
 from openai_fake_data_generator import (
-    set_openai_key,
     call_completion_model,
     create_prompt,
 )
 from presidio_nlp_engine_config import (
     create_nlp_engine_with_spacy,
@@ -26,6 +28,8 @@ from presidio_nlp_engine_config import (
     create_nlp_engine_with_azure_text_analytics,
 )
 @st.cache_resource
 def nlp_engine_and_registry(
@@ -109,6 +113,11 @@ def analyze(
         kwargs["ad_hoc_recognizers"] = [ad_hoc_recognizer] if ad_hoc_recognizer else []
         del kwargs["deny_list"]
     return analyzer_engine(model_family, model_path, ta_key, ta_endpoint).analyze(
         **kwargs
     )
@@ -200,22 +209,30 @@ def annotate(text: str, analyze_results: List[RecognizerResult]):
 def create_fake_data(
     text: str,
     analyze_results: List[RecognizerResult],
-    openai_key: str,
-    openai_model_name: str,
 ):
     """Creates a synthetic version of the text using OpenAI APIs"""
-    if not openai_key:
         return "Please provide your OpenAI key"
     results = anonymize(text=text, operator="replace", analyze_results=analyze_results)
-    set_openai_key(openai_key)
     prompt = create_prompt(results.text)
-    fake = call_openai_api(prompt, openai_model_name)
     return fake
 @st.cache_data
-def call_openai_api(prompt: str, openai_model_name: str) -> str:
-    fake_data = call_completion_model(prompt, model=openai_model_name)
     return fake_data
@@ -225,6 +242,19 @@ def create_ad_hoc_deny_list_recognizer(
     if not deny_list:
         return None
-    deny_list_recognizer = PatternRecognizer(supported_entity="GENERIC_PII", deny_list=deny_list)
-    print(deny_list_recognizer.patterns)
     return deny_list_recognizer

 Helper methods for the Presidio Streamlit app
 """
 from typing import List, Optional, Tuple
+import logging
 import streamlit as st
 from presidio_analyzer import (
     AnalyzerEngine,
     RecognizerResult,
     RecognizerRegistry,
     PatternRecognizer,
+    Pattern,
 )
 from presidio_analyzer.nlp_engine import NlpEngine
 from presidio_anonymizer import AnonymizerEngine
 from presidio_anonymizer.entities import OperatorConfig
 from openai_fake_data_generator import (
+    set_openai_params,
     call_completion_model,
     create_prompt,
+    OpenAIParams,
 )
 from presidio_nlp_engine_config import (
     create_nlp_engine_with_spacy,
     create_nlp_engine_with_azure_text_analytics,
 )
+logger = logging.getLogger("presidio-streamlit")
 @st.cache_resource
 def nlp_engine_and_registry(
         kwargs["ad_hoc_recognizers"] = [ad_hoc_recognizer] if ad_hoc_recognizer else []
         del kwargs["deny_list"]
+    if "regex_params" in kwargs and len(kwargs["regex_params"]) > 0:
+        ad_hoc_recognizer = create_ad_hoc_regex_recognizer(*kwargs["regex_params"])
+        kwargs["ad_hoc_recognizers"] = [ad_hoc_recognizer] if ad_hoc_recognizer else []
+        del kwargs["regex_params"]
     return analyzer_engine(model_family, model_path, ta_key, ta_endpoint).analyze(
         **kwargs
     )
 def create_fake_data(
     text: str,
     analyze_results: List[RecognizerResult],
+    openai_params: OpenAIParams,
 ):
     """Creates a synthetic version of the text using OpenAI APIs"""
+    # Without a key no request can be made; return a hint for the UI instead.
+    if not openai_params.openai_key:
         return "Please provide your OpenAI key"
     results = anonymize(text=text, operator="replace", analyze_results=analyze_results)
+    set_openai_params(openai_params)
     prompt = create_prompt(results.text)
+    # Report the prompt through the module logger rather than printing to stdout.
+    logger.debug("Prompt: %s", prompt)
+    fake = call_openai_api(
+        prompt=prompt,
+        openai_model_name=openai_params.model,
+        openai_deployment_name=openai_params.deployment_name,
+    )
     return fake
 @st.cache_data
+def call_openai_api(
+    prompt: str, openai_model_name: str, openai_deployment_name: Optional[str] = None
+) -> str:
+    """Request a completion for the prompt and return the generated text.
+    :param prompt: The prompt passed to the completion model
+    :param openai_model_name: Forwarded to call_completion_model as `model`
+    :param openai_deployment_name: Forwarded to call_completion_model as `deployment_id`
+    """
+    # NOTE(review): st.cache_data presumably reuses the result for repeated identical
+    # arguments across Streamlit reruns - confirm against the Streamlit caching docs.
+    fake_data = call_completion_model(
+        prompt, model=openai_model_name, deployment_id=openai_deployment_name
+    )
     return fake_data
     if not deny_list:
         return None
+    deny_list_recognizer = PatternRecognizer(
+        supported_entity="GENERIC_PII", deny_list=deny_list
+    )
     return deny_list_recognizer
+def create_ad_hoc_regex_recognizer(
+    regex: str, entity_type: str, score: float, context: Optional[List[str]] = None
+) -> Optional[PatternRecognizer]:
+    """Build a PatternRecognizer around a single regex.
+    :param regex: The regular expression to detect; when empty, no recognizer is created
+    :param entity_type: Used as the recognizer's supported entity
+    :param score: Score assigned to the pattern
+    :param context: Optional context words passed to the recognizer
+    :return: The recognizer, or None when no regex was given
+    """
+    if regex:
+        return PatternRecognizer(
+            supported_entity=entity_type,
+            patterns=[Pattern(name="Regex pattern", regex=regex, score=score)],
+            context=context,
+        )
+    return None

presidio_nlp_engine_config.py CHANGED Viewed

@@ -1,9 +1,11 @@
 from typing import Tuple
 import spacy
 from presidio_analyzer import RecognizerRegistry
 from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
 def create_nlp_engine_with_spacy(
     model_path: str,

 from typing import Tuple
+import logging
 import spacy
 from presidio_analyzer import RecognizerRegistry
 from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
+logger = logging.getLogger("presidio-streamlit")
 def create_nlp_engine_with_spacy(
     model_path: str,

presidio_streamlit.py CHANGED Viewed

@@ -1,13 +1,15 @@
 """Streamlit app for Presidio."""
 import os
 import pandas as pd
 import streamlit as st
 import streamlit.components.v1 as components
 from annotated_text import annotated_text
 from streamlit_tags import st_tags
 from presidio_helpers import (
     get_supported_entities,
     analyze,
@@ -18,29 +20,30 @@ from presidio_helpers import (
     nlp_engine_and_registry,
 )
-st.set_page_config(page_title="Presidio demo", layout="wide")
 # Sidebar
 st.sidebar.header(
     """
-PII De-Identification with Microsoft Presidio
 """
 )
-st.sidebar.info(
-    "Presidio is an open source customizable framework for PII detection and de-identification\n"
-    "[Code](https://aka.ms/presidio) | "
-    "[Tutorial](https://microsoft.github.io/presidio/tutorial/) | "
-    "[Installation](https://microsoft.github.io/presidio/installation/) | "
-    "[FAQ](https://microsoft.github.io/presidio/faq/)",
-    icon="ℹ️",
-)
-st.sidebar.markdown(
-    "[![Pypi Downloads](https://img.shields.io/pypi/dm/presidio-analyzer.svg)](https://img.shields.io/pypi/dm/presidio-analyzer.svg)"  # noqa
-    "[![MIT license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT)"
-    "![GitHub Repo stars](https://img.shields.io/github/stars/microsoft/presidio?style=social)"
-)
 model_help_text = """
     Select which Named Entity Recognition (NER) model to use for PII detection, in parallel to rule-based recognizers.
@@ -48,51 +51,56 @@ model_help_text = """
     as well as service such as Azure Text Analytics PII.
     """
 st_ta_key = st_ta_endpoint = ""
-st_model = "en_core_web_lg"
-st_model_package = st.sidebar.selectbox(
     "NER model package",
-    ["spaCy", "flair", "HuggingFace", "Azure Text Analytics"],
     index=2,
-    help="Select the NLP package to use for PII detection",
 )
-if st_model_package == "spaCy":
-    st_model = st.sidebar.selectbox(
-        "NER model for PII detection",
-        ["en_core_web_lg", "en_core_web_trf", "Other"],
-        help=model_help_text,
-    )
-elif st_model_package == "HuggingFace":
-    st_model = st.sidebar.selectbox(
-        "NER model for PII detection",
-        ["obi/deid_roberta_i2b2", "StanfordAIMI/stanford-deidentifier-base", "Other"],
-        help=model_help_text,
-    )
-elif st_model_package == "flair":
-    st_model = st.sidebar.selectbox(
-        "NER model for PII detection",
-        ["flair/ner-english-large", "Other"],
-        help=model_help_text,
-    )
-elif st_model_package == "Azure Text Analytics":
-    st_model = st.sidebar.selectbox(
-        "NER model for PII detection",
-        ["Azure Text Analytics PII"],
-        help=model_help_text,
-    )
-    st_ta_key = st.sidebar.text_input("Text Analytics Key", type="password")
-    st_ta_endpoint = st.sidebar.text_input("Text Analytics Endpoint")
 if st_model == "Other":
-    st_model = st.sidebar.text_input(
-        f"NER model name for package {st_model_package}", value=""
     )
 st.sidebar.warning("Note: Models might take some time to download. ")
 analyzer_params = (st_model_package, st_model, st_ta_key, st_ta_endpoint)
 st_operator = st.sidebar.selectbox(
     "De-identification approach",
@@ -112,8 +120,10 @@ st_operator = st.sidebar.selectbox(
 st_mask_char = "*"
 st_number_of_chars = 15
 st_encrypt_key = "WmZq4t7w!z%C&F)J"
-st_openai_key = ""
-st_openai_model = "text-davinci-003"
 if st_operator == "mask":
     st_number_of_chars = st.sidebar.number_input(
@@ -125,6 +135,22 @@ if st_operator == "mask":
 elif st_operator == "encrypt":
     st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
 elif st_operator == "synthesize":
     st_openai_key = st.sidebar.text_input(
         "OPENAI_KEY",
         value=os.getenv("OPENAI_KEY", default=""),
@@ -133,9 +159,21 @@ elif st_operator == "synthesize":
     )
     st_openai_model = st.sidebar.text_input(
         "OpenAI model for text synthesis",
-        value=st_openai_model,
         help="See more here: https://platform.openai.com/docs/models/",
     )
 st_threshold = st.sidebar.slider(
     label="Acceptance threshold",
     min_value=0.0,
@@ -153,24 +191,60 @@ st_return_decision_process = st.sidebar.checkbox(
 # Allow and deny lists
 st_deny_allow_expander = st.sidebar.expander(
-    "Allow and deny lists",
     expanded=False,
 )
 with st_deny_allow_expander:
-    st_allow_list = st_tags(label="Add words to the allow list", text="Enter word and press enter.")
-    st.caption('Allow lists contain words that are not considered PII, but are detected as such.')
-    st_deny_list = st_tags(label="Add words to the deny list", text="Enter word and press enter.")
-    st.caption("Deny lists contain words that are considered PII, but are not detected as such.")
 # Main panel
 analyzer_load_state = st.info("Starting Presidio analyzer...")
 nlp_engine, registry = nlp_engine_and_registry(*analyzer_params)
-analyzer = analyzer_engine(*analyzer_params)
 analyzer_load_state.empty()
 # Choose entities
 st_entities_expander = st.sidebar.expander("Choose entities to look for")
 st_entities = st_entities_expander.multiselect(
@@ -182,6 +256,12 @@ st_entities = st_entities_expander.multiselect(
     "More information can be found here: https://microsoft.github.io/presidio/analyzer/adding_recognizers/",
 )
 # Read default text
 with open("demo_text.txt") as f:
     demo_text = f.readlines()
@@ -190,11 +270,9 @@ with open("demo_text.txt") as f:
 col1, col2 = st.columns(2)
 # Before:
-col1.subheader("Input string:")
 st_text = col1.text_area(
-    label="Enter text",
-    value="".join(demo_text),
-    height=400,
 )
@@ -210,62 +288,65 @@ st_analyze_results = analyze(
 )
 # After
-if st_operator not in ("highlight", "synthesize"):
-    with col2:
-        st.subheader(f"Output")
-        st_anonymize_results = anonymize(
-            text=st_text,
-            operator=st_operator,
-            mask_char=st_mask_char,
-            number_of_chars=st_number_of_chars,
-            encrypt_key=st_encrypt_key,
-            analyze_results=st_analyze_results,
-        )
-        st.text_area(label="De-identified", value=st_anonymize_results.text, height=400)
-elif st_operator == "synthesize":
-    with col2:
-        st.subheader(f"OpenAI Generated output")
-        fake_data = create_fake_data(
-            st_text,
-            st_analyze_results,
-            openai_key=st_openai_key,
-            openai_model_name=st_openai_model,
-        )
-        st.text_area(label="Synthetic data", value=fake_data, height=400)
-else:
-    st.subheader("Highlighted")
-    annotated_tokens = annotate(text=st_text, analyze_results=st_analyze_results)
-    # annotated_tokens
-    annotated_text(*annotated_tokens)
-# table result
-st.subheader(
-    "Findings" if not st_return_decision_process else "Findings with decision factors"
-)
-if st_analyze_results:
-    df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
-    df["text"] = [st_text[res.start : res.end] for res in st_analyze_results]
-    df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
-        {
-            "entity_type": "Entity type",
-            "text": "Text",
-            "start": "Start",
-            "end": "End",
-            "score": "Confidence",
-        },
-        axis=1,
     )
-    df_subset["Text"] = [st_text[res.start : res.end] for res in st_analyze_results]
-    if st_return_decision_process:
-        analysis_explanation_df = pd.DataFrame.from_records(
-            [r.analysis_explanation.to_dict() for r in st_analyze_results]
         )
-        df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)
-    st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
-else:
-    st.text("No findings")
 components.html(
     """

 """Streamlit app for Presidio."""
+import logging
 import os
 import pandas as pd
 import streamlit as st
 import streamlit.components.v1 as components
+import dotenv
 from annotated_text import annotated_text
 from streamlit_tags import st_tags
+from openai_fake_data_generator import OpenAIParams
 from presidio_helpers import (
     get_supported_entities,
     analyze,
     nlp_engine_and_registry,
 )
+st.set_page_config(
+    page_title="Presidio demo",
+    layout="wide",
+    initial_sidebar_state="expanded",
+    menu_items={
+        "About": "https://microsoft.github.io/presidio/",
+    },
+)
+dotenv.load_dotenv()
+logger = logging.getLogger("presidio-streamlit")
+# os.getenv returns a string whenever the variable is set, so values such as "False" or "0"
+# would be truthy if used directly. Parse the flag into a real bool instead.
+allow_other_models = os.getenv("ALLOW_OTHER_MODELS", "false").strip().lower() in (
+    "1",
+    "true",
+    "yes",
+    "on",
+)
+can_present_results = True
 # Sidebar
 st.sidebar.header(
     """
+PII De-Identification with [Microsoft Presidio](https://microsoft.github.io/presidio/)
 """
 )
 model_help_text = """
     Select which Named Entity Recognition (NER) model to use for PII detection, in parallel to rule-based recognizers.
     as well as service such as Azure Text Analytics PII.
     """
 st_ta_key = st_ta_endpoint = ""
+model_list = [
+    "spaCy/en_core_web_lg",
+    "flair/ner-english-large",
+    "HuggingFace/obi/deid_roberta_i2b2",
+    "HuggingFace/StanfordAIMI/stanford-deidentifier-base",
+    "Azure Text Analytics PII",
+    "Other",
+]
+# "Other" (free-text model name) is the last entry; drop it unless other models are allowed.
+if not allow_other_models:
+    model_list.pop()
+# Select model
+st_model = st.sidebar.selectbox(
     "NER model package",
+    model_list,
     index=2,
+    help=model_help_text,
 )
+# Extract model package.
+st_model_package = st_model.split("/")[0]
+# Remove package prefix (if needed)
+st_model = (
+    st_model
+    if st_model_package not in ("spaCy", "HuggingFace")
+    else "/".join(st_model.split("/")[1:])
+)
 if st_model == "Other":
+    st_model_package = st.sidebar.selectbox(
+        "NER model OSS package", options=["spaCy", "Flair", "HuggingFace"]
+    )
+    st_model = st.sidebar.text_input(f"NER model name", value="")
+if st_model == "Azure Text Analytics PII":
+    st_ta_key = st.sidebar.text_input(
+        f"Text Analytics key", value=os.getenv("TA_KEY", ""), type="password"
+    )
+    st_ta_endpoint = st.sidebar.text_input(
+        f"Text Analytics endpoint",
+        value=os.getenv("TA_ENDPOINT", default=""),
+        help="For more info: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/overview",  # noqa: E501
     )
 st.sidebar.warning("Note: Models might take some time to download. ")
 analyzer_params = (st_model_package, st_model, st_ta_key, st_ta_endpoint)
+logger.debug(f"analyzer_params: {analyzer_params}")
 st_operator = st.sidebar.selectbox(
     "De-identification approach",
 st_mask_char = "*"
 st_number_of_chars = 15
 st_encrypt_key = "WmZq4t7w!z%C&F)J"
+open_ai_params = None
+logger.debug(f"st_operator: {st_operator}")
 if st_operator == "mask":
     st_number_of_chars = st.sidebar.number_input(
 elif st_operator == "encrypt":
     st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
 elif st_operator == "synthesize":
+    if os.getenv("OPENAI_TYPE", default="openai") == "Azure":
+        openai_api_type = "azure"
+        st_openai_api_base = st.sidebar.text_input(
+            "Azure OpenAI base URL",
+            value=os.getenv("AZURE_OPENAI_ENDPOINT", default=""),
+        )
+        st_deployment_name = st.sidebar.text_input(
+            "Deployment name", value=os.getenv("AZURE_OPENAI_DEPLOYMENT", default="")
+        )
+        st_openai_version = st.sidebar.text_input(
+            "OpenAI version",
+            value=os.getenv("OPENAI_API_VERSION", default="2023-05-15"),
+        )
+    else:
+        st_openai_version = openai_api_type = st_openai_api_base = None
+        st_deployment_name = ""
     st_openai_key = st.sidebar.text_input(
         "OPENAI_KEY",
         value=os.getenv("OPENAI_KEY", default=""),
     )
     st_openai_model = st.sidebar.text_input(
         "OpenAI model for text synthesis",
+        value=os.getenv("OPENAI_MODEL", default="text-davinci-003"),
         help="See more here: https://platform.openai.com/docs/models/",
     )
+    open_ai_params = OpenAIParams(
+        openai_key=st_openai_key,
+        model=st_openai_model,
+        api_base=st_openai_api_base,
+        deployment_name=st_deployment_name,
+        api_version=st_openai_version,
+        api_type=openai_api_type,
+    )
+    can_present_results = True if st_openai_key else False
 st_threshold = st.sidebar.slider(
     label="Acceptance threshold",
     min_value=0.0,
 # Allow and deny lists
 st_deny_allow_expander = st.sidebar.expander(
+    "Allowlists and denylists",
     expanded=False,
 )
 with st_deny_allow_expander:
+    st_allow_list = st_tags(
+        label="Add words to the allowlist", text="Enter word and press enter."
+    )
+    st.caption(
+        "Allowlists contain words that are not considered PII, but are detected as such."
+    )
+    st_deny_list = st_tags(
+        label="Add words to the denylist", text="Enter word and press enter."
+    )
+    st.caption(
+        "Denylists contain words that are considered PII, but are not detected as such."
+    )
 # Main panel
+with st.expander("About this demo", expanded=False):
+    st.info(
+        """Presidio is an open source customizable framework for PII detection and de-identification.
+        \n\n[Code](https://aka.ms/presidio) |
+        [Tutorial](https://microsoft.github.io/presidio/tutorial/) |
+        [Installation](https://microsoft.github.io/presidio/installation/) |
+        [FAQ](https://microsoft.github.io/presidio/faq/) |"""
+    )
+    st.info(
+        """
+    Use this demo to:
+    - Experiment with different off-the-shelf models and NLP packages.
+    - Explore the different de-identification options, including redaction, masking, encryption and more.
+    - Generate synthetic text with Microsoft Presidio and OpenAI.
+    - Configure allow and deny lists.
+    This demo website shows some of Presidio's capabilities.
+    [Visit our website](https://microsoft.github.io/presidio) for more info,
+    samples and deployment options.
+    """
+    )
+    st.markdown(
+        "[![Pypi Downloads](https://img.shields.io/pypi/dm/presidio-analyzer.svg)](https://img.shields.io/pypi/dm/presidio-analyzer.svg)"  # noqa
+        "[![MIT license](https://img.shields.io/badge/license-MIT-brightgreen.svg)](https://opensource.org/licenses/MIT)"
+        "![GitHub Repo stars](https://img.shields.io/github/stars/microsoft/presidio?style=social)"
+    )
 analyzer_load_state = st.info("Starting Presidio analyzer...")
 nlp_engine, registry = nlp_engine_and_registry(*analyzer_params)
 analyzer_load_state.empty()
 # Choose entities
 st_entities_expander = st.sidebar.expander("Choose entities to look for")
 st_entities = st_entities_expander.multiselect(
     "More information can be found here: https://microsoft.github.io/presidio/analyzer/adding_recognizers/",
 )
+analyzer_load_state = st.info("Starting Presidio analyzer...")
+analyzer = analyzer_engine(*analyzer_params)
+analyzer_load_state.empty()
 # Read default text
 with open("demo_text.txt") as f:
     demo_text = f.readlines()
 col1, col2 = st.columns(2)
 # Before:
+col1.subheader("Input")
 st_text = col1.text_area(
+    label="Enter text", value="".join(demo_text), height=400, key="text_input"
 )
 )
 # After
+if can_present_results:
+    if st_operator not in ("highlight", "synthesize"):
+        with col2:
+            st.subheader(f"Output")
+            st_anonymize_results = anonymize(
+                text=st_text,
+                operator=st_operator,
+                mask_char=st_mask_char,
+                number_of_chars=st_number_of_chars,
+                encrypt_key=st_encrypt_key,
+                analyze_results=st_analyze_results,
+            )
+            st.text_area(
+                label="De-identified", value=st_anonymize_results.text, height=400
+            )
+    elif st_operator == "synthesize":
+        with col2:
+            st.subheader(f"OpenAI Generated output")
+            fake_data = create_fake_data(
+                st_text,
+                st_analyze_results,
+                open_ai_params,
+            )
+            st.text_area(label="Synthetic data", value=fake_data, height=400)
+    else:
+        st.subheader("Highlighted")
+        annotated_tokens = annotate(text=st_text, analyze_results=st_analyze_results)
+        # annotated_tokens
+        annotated_text(*annotated_tokens)
+    # table result
+    st.subheader(
+        "Findings"
+        if not st_return_decision_process
+        else "Findings with decision factors"
     )
+    if st_analyze_results:
+        df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
+        df["text"] = [st_text[res.start : res.end] for res in st_analyze_results]
+        df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
+            {
+                "entity_type": "Entity type",
+                "text": "Text",
+                "start": "Start",
+                "end": "End",
+                "score": "Confidence",
+            },
+            axis=1,
         )
+        df_subset["Text"] = [st_text[res.start : res.end] for res in st_analyze_results]
+        if st_return_decision_process:
+            analysis_explanation_df = pd.DataFrame.from_records(
+                [r.analysis_explanation.to_dict() for r in st_analyze_results]
+            )
+            df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)
+        st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
+    else:
+        st.text("No findings")
 components.html(
     """

requirements.txt CHANGED Viewed

@@ -3,6 +3,7 @@ presidio-anonymizer
 streamlit
 streamlit-tags
 pandas
 st-annotated-text
 torch
 transformers

 streamlit
 streamlit-tags
 pandas
+python-dotenv
 st-annotated-text
 torch
 transformers

text_analytics_wrapper.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import os
 from typing import List, Optional
 import dotenv
 from azure.ai.textanalytics import TextAnalyticsClient
 from azure.core.credentials import AzureKeyCredential
@@ -8,6 +8,8 @@ from azure.core.credentials import AzureKeyCredential
 from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalysisExplanation
 from presidio_analyzer.nlp_engine import NlpArtifacts
 class TextAnalyticsWrapper(EntityRecognizer):
     from azure.ai.textanalytics._models import PiiEntityCategory
     TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]

 import os
 from typing import List, Optional
+import logging
 import dotenv
 from azure.ai.textanalytics import TextAnalyticsClient
 from azure.core.credentials import AzureKeyCredential
 from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalysisExplanation
 from presidio_analyzer.nlp_engine import NlpArtifacts
+logger = logging.getLogger("presidio-streamlit")
 class TextAnalyticsWrapper(EntityRecognizer):
     from azure.ai.textanalytics._models import PiiEntityCategory
     TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]