presidio committed on
Commit
57594ac
·
1 Parent(s): b7be871

Upload 12 files (#2)

Browse files

- Upload 12 files (17f243fd769fde0cdd472eb6975d3f2d8e55c8f7)

azure_ai_language_wrapper.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List, Optional
3
+ import logging
4
+ import dotenv
5
+ from azure.ai.textanalytics import TextAnalyticsClient
6
+ from azure.core.credentials import AzureKeyCredential
7
+
8
+ from presidio_analyzer import EntityRecognizer, RecognizerResult, AnalysisExplanation
9
+ from presidio_analyzer.nlp_engine import NlpArtifacts
10
+
11
+ logger = logging.getLogger("presidio-streamlit")
12
+
13
+
14
class AzureAIServiceWrapper(EntityRecognizer):
    """Presidio recognizer backed by the Azure Text Analytics PII API.

    The text is sent to the service through a ``TextAnalyticsClient`` and every
    returned entity whose category is in ``supported_entities`` is converted
    into a ``RecognizerResult``.
    """

    # NOTE(review): `_models` is a private SDK module; confirm whether the enum
    # should be imported from the package's public namespace instead.
    from azure.ai.textanalytics._models import PiiEntityCategory

    # All category values declared by the SDK's PiiEntityCategory enum.
    TA_SUPPORTED_ENTITIES = [r.value for r in PiiEntityCategory]

    def __init__(
        self,
        supported_entities: Optional[List[str]] = None,
        supported_language: str = "en",
        ta_client: Optional[TextAnalyticsClient] = None,
        ta_key: Optional[str] = None,
        ta_endpoint: Optional[str] = None,
    ):
        """
        Wrapper for the Azure Text Analytics client

        :param supported_entities: entity categories to report; when empty or
            None, ``TA_SUPPORTED_ENTITIES`` is used
        :param supported_language: language code passed to the service
        :param ta_client: object of type TextAnalyticsClient; when omitted, a
            client is created from ``ta_key`` and ``ta_endpoint``
        :param ta_key: Azure cognitive Services for Language key
        :param ta_endpoint: Azure cognitive Services for Language endpoint
        """

        if not supported_entities:
            supported_entities = self.TA_SUPPORTED_ENTITIES

        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            name="Azure AI Language PII",
        )

        self.ta_key = ta_key
        self.ta_endpoint = ta_endpoint

        if not ta_client:
            ta_client = self.__authenticate_client(ta_key, ta_endpoint)
        self.ta_client = ta_client

    @staticmethod
    def __authenticate_client(key: str, endpoint: str):
        """Create a TextAnalyticsClient authenticated with an API key.

        :param key: service key wrapped in an ``AzureKeyCredential``
        :param endpoint: service endpoint URL
        :return: the new ``TextAnalyticsClient``
        """
        ta_credential = AzureKeyCredential(key)
        text_analytics_client = TextAnalyticsClient(
            endpoint=endpoint, credential=ta_credential
        )
        return text_analytics_client

    def analyze(
        self,
        text: str,
        entities: Optional[List[str]] = None,
        nlp_artifacts: NlpArtifacts = None,
    ) -> List[RecognizerResult]:
        """Detect PII entities in ``text`` using the remote service.

        :param text: the text to analyze (sent as a single document)
        :param entities: accepted for interface compatibility; results are
            filtered by ``self.supported_entities``, not by this argument
        :param nlp_artifacts: accepted for interface compatibility; not used
        :return: one ``RecognizerResult`` per supported entity found
        """
        response = self.ta_client.recognize_pii_entities(
            [text], language=self.supported_language
        )

        recognizer_results = []
        for doc in response:
            if doc.is_error:
                # Keep the best-effort behaviour (skip the document) but leave
                # a trace instead of dropping the failure silently.
                logging.getLogger("presidio-streamlit").warning(
                    "Azure AI Language returned an error document: %s",
                    getattr(doc, "error", doc),
                )
                continue

            for entity in doc.entities:
                if entity.category not in self.supported_entities:
                    continue
                analysis_explanation = AzureAIServiceWrapper._build_explanation(
                    original_score=entity.confidence_score,
                    entity_type=entity.category,
                )
                recognizer_results.append(
                    RecognizerResult(
                        entity_type=entity.category,
                        start=entity.offset,
                        end=entity.offset + len(entity.text),
                        score=entity.confidence_score,
                        analysis_explanation=analysis_explanation,
                    )
                )

        return recognizer_results

    @staticmethod
    def _build_explanation(
        original_score: float, entity_type: str
    ) -> AnalysisExplanation:
        """Build the explanation attached to each result.

        :param original_score: confidence score reported by the service
        :param entity_type: category reported by the service
        :return: an ``AnalysisExplanation`` naming this recognizer class
        """
        explanation = AnalysisExplanation(
            # `Cls.__class__.__name__` is the metaclass name; use the class's
            # own name so the explanation identifies this recognizer.
            recognizer=AzureAIServiceWrapper.__name__,
            original_score=original_score,
            textual_explanation=f"Identified as {entity_type} by Text Analytics",
        )
        return explanation

    def load(self) -> None:
        """No-op: there is nothing to load locally."""
        pass
101
+
102
+
103
if __name__ == "__main__":
    # Manual smoke run: build an analyzer through presidio_helpers using the
    # credentials found in the environment and analyze a sample paragraph.
    import presidio_helpers

    # Load variables (TA_KEY / TA_ENDPOINT are read below) from a .env file.
    dotenv.load_dotenv()

    sample_text = """
Here are a few example sentences we currently support:

Hello, my name is David Johnson and I live in Maine.
My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.

On September 18 I visited microsoft.com and sent an email to test@presidio.site, from the IP 192.168.0.1.

My passport: 191280342 and my phone number: (212) 555-1234.

This is a valid International Bank Account Number: IL150120690000003111111 . Can you please check the status on bank account 954567876544?

Kate's social security number is 078-05-1126. Her driver license? it is 1234567A.
"""

    engine = presidio_helpers.analyzer_engine(
        model_path="Azure Text Analytics PII",
        ta_key=os.environ["TA_KEY"],
        ta_endpoint=os.environ["TA_ENDPOINT"],
    )
    engine.analyze(text=sample_text, language="en")
flair_recognizer.py CHANGED
@@ -59,9 +59,7 @@ class FlairRecognizer(EntityRecognizer):
59
  # ({"MISCELLANEOUS"}, {"MISC"}), # Probably not PII
60
  ]
61
 
62
- MODEL_LANGUAGES = {
63
- "en": "flair/ner-english-large"
64
- }
65
 
66
  PRESIDIO_EQUIVALENCES = {
67
  "PER": "PERSON",
@@ -76,7 +74,7 @@ class FlairRecognizer(EntityRecognizer):
76
  supported_entities: Optional[List[str]] = None,
77
  check_label_groups: Optional[Tuple[Set, Set]] = None,
78
  model: SequenceTagger = None,
79
- model_path: Optional[str] = None
80
  ):
81
  self.check_label_groups = (
82
  check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
@@ -93,7 +91,9 @@ class FlairRecognizer(EntityRecognizer):
93
  self.model = SequenceTagger.load(model_path)
94
  else:
95
  print(f"Loading model for language {supported_language}")
96
- self.model = SequenceTagger.load(self.MODEL_LANGUAGES.get(supported_language))
 
 
97
 
98
  super().__init__(
99
  supported_entities=supported_entities,
 
59
  # ({"MISCELLANEOUS"}, {"MISC"}), # Probably not PII
60
  ]
61
 
62
+ MODEL_LANGUAGES = {"en": "flair/ner-english-large"}
 
 
63
 
64
  PRESIDIO_EQUIVALENCES = {
65
  "PER": "PERSON",
 
74
  supported_entities: Optional[List[str]] = None,
75
  check_label_groups: Optional[Tuple[Set, Set]] = None,
76
  model: SequenceTagger = None,
77
+ model_path: Optional[str] = None,
78
  ):
79
  self.check_label_groups = (
80
  check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
 
91
  self.model = SequenceTagger.load(model_path)
92
  else:
93
  print(f"Loading model for language {supported_language}")
94
+ self.model = SequenceTagger.load(
95
+ self.MODEL_LANGUAGES.get(supported_language)
96
+ )
97
 
98
  super().__init__(
99
  supported_entities=supported_entities,
flair_test.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import generic wrappers
2
+ from transformers import AutoModel, AutoTokenizer
3
+
4
+
5
if __name__ == "__main__":
    # Stand-alone check that the Flair NER model loads and tags a sentence.
    from flair.data import Sentence
    from flair.models import SequenceTagger

    ner_tagger = SequenceTagger.load("flair/ner-english-large")

    # Run the tagger on one example sentence (annotated in place by predict).
    example = Sentence("George Washington went to Washington")
    ner_tagger.predict(example)

    # Show the sentence, then each predicted NER span on its own line.
    print(example)
    print("The following NER tags are found:")
    for span in example.get_spans("ner"):
        print(span)
index.md CHANGED
@@ -5,22 +5,32 @@ The app is based on the [streamlit](https://streamlit.io/) package.
5
  A live version can be found here: https://huggingface.co/spaces/presidio/presidio_demo
6
 
7
  ## Requirements
8
- 1. Clone the repo and move to the `docs/samples/python/streamlit ` folder
9
- 1. Install dependencies (preferably in a virtual environment)
10
 
11
  ```sh
12
  pip install -r requirements
13
  ```
14
  > Note: This would install additional packages such as `transformers` and `flair` which are not mandatory for using Presidio.
15
 
16
- 2.
17
  3. *Optional*: Update the `analyzer_engine` and `anonymizer_engine` functions for your specific implementation (in `presidio_helpers.py`).
18
- 3. Start the app:
19
 
20
  ```sh
21
  streamlit run presidio_streamlit.py
22
  ```
23
 
 
 
 
 
 
 
 
 
 
 
 
24
  ## Output
25
  Output should be similar to this screenshot:
26
- ![image](https://user-images.githubusercontent.com/3776619/232289541-d59992e1-52a4-44c1-b904-b22c72c02a5b.png)
 
5
  A live version can be found here: https://huggingface.co/spaces/presidio/presidio_demo
6
 
7
  ## Requirements
8
+ 1. Clone the repo and move to the `docs/samples/python/streamlit` folder
9
+ 2. Install dependencies (preferably in a virtual environment)
10
 
11
  ```sh
12
  pip install -r requirements
13
  ```
14
  > Note: This would install additional packages such as `transformers` and `flair` which are not mandatory for using Presidio.
15
 
 
16
  3. *Optional*: Update the `analyzer_engine` and `anonymizer_engine` functions for your specific implementation (in `presidio_helpers.py`).
17
+ 4. Start the app:
18
 
19
  ```sh
20
  streamlit run presidio_streamlit.py
21
  ```
22
 
23
+ 5. *Optional*: Add a `.env` file with the following environment variables for further customization:
24
+ ```sh
25
+ TA_KEY=YOUR_TEXT_ANALYTICS_KEY
26
+ TA_ENDPOINT=YOUR_TEXT_ANALYTICS_ENDPOINT
27
+ OPENAI_TYPE="Azure" #or "openai"
28
+ OPENAI_KEY=YOUR_OPENAI_KEY
29
+ OPENAI_API_VERSION = "2023-05-15"
30
+ AZURE_OPENAI_ENDPOINT=YOUR_AZURE_OPENAI_ENDPOINT
31
+ AZURE_OPENAI_DEPLOYMENT=text-davinci-003
32
+ ALLOW_OTHER_MODELS=true #true if the user could download new models
33
+ ```
34
  ## Output
35
  Output should be similar to this screenshot:
36
+ ![image](https://github.com/microsoft/presidio/assets/3776619/7d0eadf1-e750-4747-8b59-8203aa43cac8)
openai_fake_data_generator.py CHANGED
@@ -2,51 +2,45 @@ from collections import namedtuple
2
  from typing import Optional
3
 
4
  import openai
 
5
  import logging
6
 
7
  logger = logging.getLogger("presidio-streamlit")
8
 
9
  OpenAIParams = namedtuple(
10
  "open_ai_params",
11
- ["openai_key", "model", "api_base", "deployment_name", "api_version", "api_type"],
12
  )
13
 
14
 
15
- def set_openai_params(openai_params: OpenAIParams):
16
- """Set the OpenAI API key.
17
- :param openai_params: OpenAIParams object with the following fields: key, model, api version, deployment_name,
18
- The latter only relate to Azure OpenAI deployments.
19
- """
20
- openai.api_key = openai_params.openai_key
21
- openai.api_version = openai_params.api_version
22
- if openai_params.api_base:
23
- openai.api_base = openai_params.api_base
24
- openai.api_type = openai_params.api_type
25
-
26
-
27
  def call_completion_model(
28
  prompt: str,
29
- model: str = "text-davinci-003",
30
- max_tokens: int = 512,
31
- deployment_id: Optional[str] = None,
32
  ) -> str:
33
  """Creates a request for the OpenAI Completion service and returns the response.
34
 
35
  :param prompt: The prompt for the completion model
36
- :param model: OpenAI model name
37
- :param max_tokens: Model's max_tokens parameter
38
- :param deployment_id: Azure OpenAI deployment ID
39
  """
40
- if deployment_id:
41
- response = openai.Completion.create(
42
- deployment_id=deployment_id, model=model, prompt=prompt, max_tokens=max_tokens
 
 
 
43
  )
44
  else:
45
- response = openai.Completion.create(
46
- model=model, prompt=prompt, max_tokens=max_tokens
47
- )
48
 
49
- return response["choices"][0].text
 
 
 
 
 
 
50
 
51
 
52
  def create_prompt(anonymized_text: str) -> str:
@@ -64,17 +58,18 @@ def create_prompt(anonymized_text: str) -> str:
64
 
65
  a. Use completely random numbers, so every digit is drawn between 0 and 9.
66
  b. Use realistic names that come from diverse genders, ethnicities and countries.
67
- c. If there are no placeholders, return the text as is and provide an answer.
68
  d. Keep the formatting as close to the original as possible.
69
  e. If PII exists in the input, replace it with fake values in the output.
 
70
 
71
- input: How do I change the limit on my credit card {{credit_card_number}}?
72
  output: How do I change the limit on my credit card 2539 3519 2345 1555?
73
- input: <PERSON> was the chief science officer at <ORGANIZATION>.
74
  output: Katherine Buckjov was the chief science officer at NASA.
75
- input: Cameroon lives in <LOCATION>.
76
  output: Vladimir lives in Moscow.
77
- input: {anonymized_text}
78
- output:
79
- """
80
  return prompt
 
2
  from typing import Optional
3
 
4
  import openai
5
+ from openai import OpenAI, AzureOpenAI
6
  import logging
7
 
8
  logger = logging.getLogger("presidio-streamlit")
9
 
10
  OpenAIParams = namedtuple(
11
  "open_ai_params",
12
+ ["openai_key", "model", "api_base", "deployment_id", "api_version", "api_type"],
13
  )
14
 
15
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def call_completion_model(
17
  prompt: str,
18
+ openai_params: OpenAIParams,
19
+ max_tokens: Optional[int] = 256,
 
20
  ) -> str:
21
  """Creates a request for the OpenAI Completion service and returns the response.
22
 
23
  :param prompt: The prompt for the completion model
24
+ :param openai_params: OpenAI parameters for the completion model
25
+ :param max_tokens: The maximum number of tokens to generate.
 
26
  """
27
+ if openai_params.api_type.lower() == "azure":
28
+ client = AzureOpenAI(
29
+ api_version=openai_params.api_version,
30
+ api_key=openai_params.openai_key,
31
+ azure_endpoint=openai_params.api_base,
32
+ azure_deployment=openai_params.deployment_id,
33
  )
34
  else:
35
+ client = OpenAI(api_key=openai_params.openai_key)
 
 
36
 
37
+ response = client.completions.create(
38
+ model=openai_params.model,
39
+ prompt=prompt,
40
+ max_tokens=max_tokens,
41
+ )
42
+
43
+ return response.choices[0].text.strip()
44
 
45
 
46
  def create_prompt(anonymized_text: str) -> str:
 
58
 
59
  a. Use completely random numbers, so every digit is drawn between 0 and 9.
60
  b. Use realistic names that come from diverse genders, ethnicities and countries.
61
+ c. If there are no placeholders, return the text as is.
62
  d. Keep the formatting as close to the original as possible.
63
  e. If PII exists in the input, replace it with fake values in the output.
64
+ f. Remove whitespace before and after the generated text
65
 
66
+ input: [[TEXT STARTS]] How do I change the limit on my credit card {{credit_card_number}}?[[TEXT ENDS]]
67
  output: How do I change the limit on my credit card 2539 3519 2345 1555?
68
+ input: [[TEXT STARTS]]<PERSON> was the chief science officer at <ORGANIZATION>.[[TEXT ENDS]]
69
  output: Katherine Buckjov was the chief science officer at NASA.
70
+ input: [[TEXT STARTS]]Cameroon lives in <LOCATION>.[[TEXT ENDS]]
71
  output: Vladimir lives in Moscow.
72
+
73
+ input: [[TEXT STARTS]]{anonymized_text}[[TEXT ENDS]]
74
+ output:"""
75
  return prompt
presidio_helpers.py CHANGED
@@ -16,16 +16,16 @@ from presidio_anonymizer import AnonymizerEngine
16
  from presidio_anonymizer.entities import OperatorConfig
17
 
18
  from openai_fake_data_generator import (
19
- set_openai_params,
20
  call_completion_model,
21
- create_prompt,
22
  OpenAIParams,
 
23
  )
24
  from presidio_nlp_engine_config import (
25
  create_nlp_engine_with_spacy,
26
  create_nlp_engine_with_flair,
27
  create_nlp_engine_with_transformers,
28
- create_nlp_engine_with_azure_text_analytics,
 
29
  )
30
 
31
  logger = logging.getLogger("presidio-streamlit")
@@ -49,14 +49,16 @@ def nlp_engine_and_registry(
49
  """
50
 
51
  # Set up NLP Engine according to the model of choice
52
- if "spaCy" in model_family:
53
  return create_nlp_engine_with_spacy(model_path)
54
- elif "flair" in model_family:
 
 
55
  return create_nlp_engine_with_flair(model_path)
56
- elif "HuggingFace" in model_family:
57
  return create_nlp_engine_with_transformers(model_path)
58
- elif "Azure Text Analytics" in model_family:
59
- return create_nlp_engine_with_azure_text_analytics(ta_key, ta_endpoint)
60
  else:
61
  raise ValueError(f"Model family {model_family} not supported")
62
 
@@ -215,14 +217,9 @@ def create_fake_data(
215
  if not openai_params.openai_key:
216
  return "Please provide your OpenAI key"
217
  results = anonymize(text=text, operator="replace", analyze_results=analyze_results)
218
- set_openai_params(openai_params)
219
  prompt = create_prompt(results.text)
220
  print(f"Prompt: {prompt}")
221
- fake = call_openai_api(
222
- prompt=prompt,
223
- openai_model_name=openai_params.model,
224
- openai_deployment_name=openai_params.deployment_name,
225
- )
226
  return fake
227
 
228
 
 
16
  from presidio_anonymizer.entities import OperatorConfig
17
 
18
  from openai_fake_data_generator import (
 
19
  call_completion_model,
 
20
  OpenAIParams,
21
+ create_prompt,
22
  )
23
  from presidio_nlp_engine_config import (
24
  create_nlp_engine_with_spacy,
25
  create_nlp_engine_with_flair,
26
  create_nlp_engine_with_transformers,
27
+ create_nlp_engine_with_azure_ai_language,
28
+ create_nlp_engine_with_stanza,
29
  )
30
 
31
  logger = logging.getLogger("presidio-streamlit")
 
49
  """
50
 
51
  # Set up NLP Engine according to the model of choice
52
+ if "spacy" in model_family.lower():
53
  return create_nlp_engine_with_spacy(model_path)
54
+ if "stanza" in model_family.lower():
55
+ return create_nlp_engine_with_stanza(model_path)
56
+ elif "flair" in model_family.lower():
57
  return create_nlp_engine_with_flair(model_path)
58
+ elif "huggingface" in model_family.lower():
59
  return create_nlp_engine_with_transformers(model_path)
60
+ elif "azure ai language" in model_family.lower():
61
+ return create_nlp_engine_with_azure_ai_language(ta_key, ta_endpoint)
62
  else:
63
  raise ValueError(f"Model family {model_family} not supported")
64
 
 
217
  if not openai_params.openai_key:
218
  return "Please provide your OpenAI key"
219
  results = anonymize(text=text, operator="replace", analyze_results=analyze_results)
 
220
  prompt = create_prompt(results.text)
221
  print(f"Prompt: {prompt}")
222
+ fake = call_completion_model(prompt=prompt, openai_params=openai_params)
 
 
 
 
223
  return fake
224
 
225
 
presidio_nlp_engine_config.py CHANGED
@@ -1,8 +1,12 @@
1
- from typing import Tuple
2
  import logging
 
 
3
  import spacy
4
  from presidio_analyzer import RecognizerRegistry
5
- from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
 
 
 
6
 
7
  logger = logging.getLogger("presidio-streamlit")
8
 
@@ -12,21 +16,70 @@ def create_nlp_engine_with_spacy(
12
  ) -> Tuple[NlpEngine, RecognizerRegistry]:
13
  """
14
  Instantiate an NlpEngine with a spaCy model
15
- :param model_path: spaCy model path.
16
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  registry = RecognizerRegistry()
18
- registry.load_predefined_recognizers()
19
 
20
- if not spacy.util.is_package(model_path):
21
- spacy.cli.download(model_path)
22
 
 
 
 
 
 
 
 
 
23
  nlp_configuration = {
24
- "nlp_engine_name": "spacy",
25
  "models": [{"lang_code": "en", "model_name": model_path}],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  }
27
 
28
  nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
29
 
 
 
 
30
  return nlp_engine, registry
31
 
32
 
@@ -39,41 +92,62 @@ def create_nlp_engine_with_transformers(
39
  would return NlpArtifacts such as POS and lemmas.
40
  :param model_path: HuggingFace model path.
41
  """
 
42
 
43
- from transformers_rec import (
44
- STANFORD_COFIGURATION,
45
- BERT_DEID_CONFIGURATION,
46
- TransformersRecognizer,
47
- )
48
-
49
- registry = RecognizerRegistry()
50
- registry.load_predefined_recognizers()
51
-
52
- if not spacy.util.is_package("en_core_web_sm"):
53
- spacy.cli.download("en_core_web_sm")
54
- # Using a small spaCy model + a HF NER model
55
- transformers_recognizer = TransformersRecognizer(model_path=model_path)
56
-
57
- if model_path == "StanfordAIMI/stanford-deidentifier-base":
58
- transformers_recognizer.load_transformer(**STANFORD_COFIGURATION)
59
- elif model_path == "obi/deid_roberta_i2b2":
60
- transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
61
- else:
62
- print(f"Warning: Model has no configuration, loading default.")
63
- transformers_recognizer.load_transformer(**BERT_DEID_CONFIGURATION)
64
-
65
- # Use small spaCy model, no need for both spacy and HF models
66
- # The transformers model is used here as a recognizer, not as an NlpEngine
67
  nlp_configuration = {
68
- "nlp_engine_name": "spacy",
69
- "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  }
71
 
72
- registry.add_recognizer(transformers_recognizer)
73
- registry.remove_recognizer("SpacyRecognizer")
74
-
75
  nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
76
 
 
 
 
77
  return nlp_engine, registry
78
 
79
 
@@ -91,6 +165,8 @@ def create_nlp_engine_with_flair(
91
  registry = RecognizerRegistry()
92
  registry.load_predefined_recognizers()
93
 
 
 
94
  if not spacy.util.is_package("en_core_web_sm"):
95
  spacy.cli.download("en_core_web_sm")
96
  # Using a small spaCy model + a Flair NER model
@@ -107,7 +183,7 @@ def create_nlp_engine_with_flair(
107
  return nlp_engine, registry
108
 
109
 
110
- def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
111
  """
112
  Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.
113
  The TextAnalyticsWrapper would return results from calling Azure Text Analytics PII, the spaCy model
@@ -115,7 +191,7 @@ def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
115
  :param ta_key: Azure Text Analytics key.
116
  :param ta_endpoint: Azure Text Analytics endpoint.
117
  """
118
- from text_analytics_wrapper import TextAnalyticsWrapper
119
 
120
  if not ta_key or not ta_endpoint:
121
  raise RuntimeError("Please fill in the Text Analytics endpoint details")
@@ -123,7 +199,9 @@ def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
123
  registry = RecognizerRegistry()
124
  registry.load_predefined_recognizers()
125
 
126
- ta_recognizer = TextAnalyticsWrapper(ta_endpoint=ta_endpoint, ta_key=ta_key)
 
 
127
  nlp_configuration = {
128
  "nlp_engine_name": "spacy",
129
  "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
@@ -131,7 +209,7 @@ def create_nlp_engine_with_azure_text_analytics(ta_key: str, ta_endpoint: str):
131
 
132
  nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
133
 
134
- registry.add_recognizer(ta_recognizer)
135
  registry.remove_recognizer("SpacyRecognizer")
136
 
137
  return nlp_engine, registry
 
 
1
  import logging
2
+ from typing import Tuple
3
+
4
  import spacy
5
  from presidio_analyzer import RecognizerRegistry
6
+ from presidio_analyzer.nlp_engine import (
7
+ NlpEngine,
8
+ NlpEngineProvider,
9
+ )
10
 
11
  logger = logging.getLogger("presidio-streamlit")
12
 
 
16
  ) -> Tuple[NlpEngine, RecognizerRegistry]:
17
  """
18
  Instantiate an NlpEngine with a spaCy model
19
+ :param model_path: path to model / model name.
20
  """
21
+ nlp_configuration = {
22
+ "nlp_engine_name": "spacy",
23
+ "models": [{"lang_code": "en", "model_name": model_path}],
24
+ "ner_model_configuration": {
25
+ "model_to_presidio_entity_mapping": {
26
+ "PER": "PERSON",
27
+ "PERSON": "PERSON",
28
+ "NORP": "NRP",
29
+ "FAC": "FACILITY",
30
+ "LOC": "LOCATION",
31
+ "GPE": "LOCATION",
32
+ "LOCATION": "LOCATION",
33
+ "ORG": "ORGANIZATION",
34
+ "ORGANIZATION": "ORGANIZATION",
35
+ "DATE": "DATE_TIME",
36
+ "TIME": "DATE_TIME",
37
+ },
38
+ "low_confidence_score_multiplier": 0.4,
39
+ "low_score_entity_names": ["ORG", "ORGANIZATION"],
40
+ },
41
+ }
42
+
43
+ nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
44
+
45
  registry = RecognizerRegistry()
46
+ registry.load_predefined_recognizers(nlp_engine=nlp_engine)
47
 
48
+ return nlp_engine, registry
 
49
 
50
+
51
def create_nlp_engine_with_stanza(
    model_path: str,
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Build a stanza-based NlpEngine together with a recognizer registry.

    :param model_path: path to model / model name.
    :return: the created NlpEngine and a RecognizerRegistry whose predefined
        recognizers were loaded with that engine.
    """
    # Model NER labels -> Presidio entity names.
    label_mapping = {
        "PER": "PERSON",
        "PERSON": "PERSON",
        "NORP": "NRP",
        "FAC": "FACILITY",
        "LOC": "LOCATION",
        "GPE": "LOCATION",
        "LOCATION": "LOCATION",
        "ORG": "ORGANIZATION",
        "ORGANIZATION": "ORGANIZATION",
        "DATE": "DATE_TIME",
        "TIME": "DATE_TIME",
    }
    english_model = {"lang_code": "en", "model_name": model_path}
    engine_settings = {
        "nlp_engine_name": "stanza",
        "models": [english_model],
        "ner_model_configuration": {
            "model_to_presidio_entity_mapping": label_mapping,
        },
    }

    provider = NlpEngineProvider(nlp_configuration=engine_settings)
    stanza_engine = provider.create_engine()

    recognizer_registry = RecognizerRegistry()
    recognizer_registry.load_predefined_recognizers(nlp_engine=stanza_engine)

    return stanza_engine, recognizer_registry
84
 
85
 
 
92
  would return NlpArtifacts such as POS and lemmas.
93
  :param model_path: HuggingFace model path.
94
  """
95
+ print(f"Loading Transformers model: {model_path} of type {type(model_path)}")
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  nlp_configuration = {
98
+ "nlp_engine_name": "transformers",
99
+ "models": [
100
+ {
101
+ "lang_code": "en",
102
+ "model_name": {"spacy": "en_core_web_sm", "transformers": model_path},
103
+ }
104
+ ],
105
+ "ner_model_configuration": {
106
+ "model_to_presidio_entity_mapping": {
107
+ "PER": "PERSON",
108
+ "PERSON": "PERSON",
109
+ "LOC": "LOCATION",
110
+ "LOCATION": "LOCATION",
111
+ "GPE": "LOCATION",
112
+ "ORG": "ORGANIZATION",
113
+ "ORGANIZATION": "ORGANIZATION",
114
+ "NORP": "NRP",
115
+ "AGE": "AGE",
116
+ "ID": "ID",
117
+ "EMAIL": "EMAIL",
118
+ "PATIENT": "PERSON",
119
+ "STAFF": "PERSON",
120
+ "HOSP": "ORGANIZATION",
121
+ "PATORG": "ORGANIZATION",
122
+ "DATE": "DATE_TIME",
123
+ "TIME": "DATE_TIME",
124
+ "PHONE": "PHONE_NUMBER",
125
+ "HCW": "PERSON",
126
+ "HOSPITAL": "ORGANIZATION",
127
+ "FACILITY": "LOCATION",
128
+ },
129
+ "low_confidence_score_multiplier": 0.4,
130
+ "low_score_entity_names": ["ID"],
131
+ "labels_to_ignore": [
132
+ "CARDINAL",
133
+ "EVENT",
134
+ "LANGUAGE",
135
+ "LAW",
136
+ "MONEY",
137
+ "ORDINAL",
138
+ "PERCENT",
139
+ "PRODUCT",
140
+ "QUANTITY",
141
+ "WORK_OF_ART",
142
+ ],
143
+ },
144
  }
145
 
 
 
 
146
  nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
147
 
148
+ registry = RecognizerRegistry()
149
+ registry.load_predefined_recognizers(nlp_engine=nlp_engine)
150
+
151
  return nlp_engine, registry
152
 
153
 
 
165
  registry = RecognizerRegistry()
166
  registry.load_predefined_recognizers()
167
 
168
+ # there is no official Flair NlpEngine, hence we load it as an additional recognizer
169
+
170
  if not spacy.util.is_package("en_core_web_sm"):
171
  spacy.cli.download("en_core_web_sm")
172
  # Using a small spaCy model + a Flair NER model
 
183
  return nlp_engine, registry
184
 
185
 
186
+ def create_nlp_engine_with_azure_ai_language(ta_key: str, ta_endpoint: str):
187
  """
188
  Instantiate an NlpEngine with a TextAnalyticsWrapper and a small spaCy model.
189
  The TextAnalyticsWrapper would return results from calling Azure Text Analytics PII, the spaCy model
 
191
  :param ta_key: Azure Text Analytics key.
192
  :param ta_endpoint: Azure Text Analytics endpoint.
193
  """
194
+ from azure_ai_language_wrapper import AzureAIServiceWrapper
195
 
196
  if not ta_key or not ta_endpoint:
197
  raise RuntimeError("Please fill in the Text Analytics endpoint details")
 
199
  registry = RecognizerRegistry()
200
  registry.load_predefined_recognizers()
201
 
202
+ azure_ai_language_recognizer = AzureAIServiceWrapper(
203
+ ta_endpoint=ta_endpoint, ta_key=ta_key
204
+ )
205
  nlp_configuration = {
206
  "nlp_engine_name": "spacy",
207
  "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
 
209
 
210
  nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
211
 
212
+ registry.add_recognizer(azure_ai_language_recognizer)
213
  registry.remove_recognizer("SpacyRecognizer")
214
 
215
  return nlp_engine, registry
presidio_streamlit.py CHANGED
@@ -56,7 +56,8 @@ model_list = [
56
  "flair/ner-english-large",
57
  "HuggingFace/obi/deid_roberta_i2b2",
58
  "HuggingFace/StanfordAIMI/stanford-deidentifier-base",
59
- "Azure Text Analytics PII",
 
60
  "Other",
61
  ]
62
  if not allow_other_models:
@@ -75,22 +76,22 @@ st_model_package = st_model.split("/")[0]
75
  # Remove package prefix (if needed)
76
  st_model = (
77
  st_model
78
- if st_model_package not in ("spaCy", "HuggingFace")
79
  else "/".join(st_model.split("/")[1:])
80
  )
81
 
82
  if st_model == "Other":
83
  st_model_package = st.sidebar.selectbox(
84
- "NER model OSS package", options=["spaCy", "Flair", "HuggingFace"]
85
  )
86
  st_model = st.sidebar.text_input(f"NER model name", value="")
87
 
88
- if st_model == "Azure Text Analytics PII":
89
  st_ta_key = st.sidebar.text_input(
90
- f"Text Analytics key", value=os.getenv("TA_KEY", ""), type="password"
91
  )
92
  st_ta_endpoint = st.sidebar.text_input(
93
- f"Text Analytics endpoint",
94
  value=os.getenv("TA_ENDPOINT", default=""),
95
  help="For more info: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/overview", # noqa: E501
96
  )
@@ -124,23 +125,18 @@ open_ai_params = None
124
 
125
  logger.debug(f"st_operator: {st_operator}")
126
 
127
- if st_operator == "mask":
128
- st_number_of_chars = st.sidebar.number_input(
129
- "number of chars", value=st_number_of_chars, min_value=0, max_value=100
130
- )
131
- st_mask_char = st.sidebar.text_input(
132
- "Mask character", value=st_mask_char, max_chars=1
133
- )
134
- elif st_operator == "encrypt":
135
- st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
136
- elif st_operator == "synthesize":
137
  if os.getenv("OPENAI_TYPE", default="openai") == "Azure":
138
  openai_api_type = "azure"
139
  st_openai_api_base = st.sidebar.text_input(
140
  "Azure OpenAI base URL",
141
  value=os.getenv("AZURE_OPENAI_ENDPOINT", default=""),
142
  )
143
- st_deployment_name = st.sidebar.text_input(
 
144
  "Deployment name", value=os.getenv("AZURE_OPENAI_DEPLOYMENT", default="")
145
  )
146
  st_openai_version = st.sidebar.text_input(
@@ -148,11 +144,13 @@ elif st_operator == "synthesize":
148
  value=os.getenv("OPENAI_API_VERSION", default="2023-05-15"),
149
  )
150
  else:
151
- st_openai_version = openai_api_type = st_openai_api_base = None
152
- st_deployment_name = ""
 
 
153
  st_openai_key = st.sidebar.text_input(
154
  "OPENAI_KEY",
155
- value=os.getenv("OPENAI_KEY", default=""),
156
  help="See https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key for more info.",
157
  type="password",
158
  )
@@ -161,12 +159,40 @@ elif st_operator == "synthesize":
161
  value=os.getenv("OPENAI_MODEL", default="text-davinci-003"),
162
  help="See more here: https://platform.openai.com/docs/models/",
163
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
  open_ai_params = OpenAIParams(
166
  openai_key=st_openai_key,
167
  model=st_openai_model,
168
  api_base=st_openai_api_base,
169
- deployment_name=st_deployment_name,
170
  api_version=st_openai_version,
171
  api_type=openai_api_type,
172
  )
@@ -214,7 +240,8 @@ with st.expander("About this demo", expanded=False):
214
  \n\n[Code](https://aka.ms/presidio) |
215
  [Tutorial](https://microsoft.github.io/presidio/tutorial/) |
216
  [Installation](https://microsoft.github.io/presidio/installation/) |
217
- [FAQ](https://microsoft.github.io/presidio/faq/) |"""
 
218
  )
219
 
220
  st.info(
 
56
  "flair/ner-english-large",
57
  "HuggingFace/obi/deid_roberta_i2b2",
58
  "HuggingFace/StanfordAIMI/stanford-deidentifier-base",
59
+ "stanza/en",
60
+ "Azure AI Language",
61
  "Other",
62
  ]
63
  if not allow_other_models:
 
76
  # Remove package prefix (if needed)
77
  st_model = (
78
  st_model
79
+ if st_model_package.lower() not in ("spacy", "stanza", "huggingface")
80
  else "/".join(st_model.split("/")[1:])
81
  )
82
 
83
  if st_model == "Other":
84
  st_model_package = st.sidebar.selectbox(
85
+ "NER model OSS package", options=["spaCy", "stanza", "Flair", "HuggingFace"]
86
  )
87
  st_model = st.sidebar.text_input(f"NER model name", value="")
88
 
89
+ if st_model == "Azure AI Language":
90
  st_ta_key = st.sidebar.text_input(
91
+ f"Azure AI Language key", value=os.getenv("TA_KEY", ""), type="password"
92
  )
93
  st_ta_endpoint = st.sidebar.text_input(
94
+ f"Azure AI Language endpoint",
95
  value=os.getenv("TA_ENDPOINT", default=""),
96
  help="For more info: https://learn.microsoft.com/en-us/azure/cognitive-services/language-service/personally-identifiable-information/overview", # noqa: E501
97
  )
 
125
 
126
  logger.debug(f"st_operator: {st_operator}")
127
 
128
+
129
+ def set_up_openai_synthesis():
130
+ """Set up the OpenAI API key and model for text synthesis."""
131
+
 
 
 
 
 
 
132
  if os.getenv("OPENAI_TYPE", default="openai") == "Azure":
133
  openai_api_type = "azure"
134
  st_openai_api_base = st.sidebar.text_input(
135
  "Azure OpenAI base URL",
136
  value=os.getenv("AZURE_OPENAI_ENDPOINT", default=""),
137
  )
138
+ openai_key = os.getenv("AZURE_OPENAI_KEY", default="")
139
+ st_deployment_id = st.sidebar.text_input(
140
  "Deployment name", value=os.getenv("AZURE_OPENAI_DEPLOYMENT", default="")
141
  )
142
  st_openai_version = st.sidebar.text_input(
 
144
  value=os.getenv("OPENAI_API_VERSION", default="2023-05-15"),
145
  )
146
  else:
147
+ openai_api_type = "openai"
148
+ st_openai_version = st_openai_api_base = None
149
+ st_deployment_id = ""
150
+ openai_key = os.getenv("OPENAI_KEY", default="")
151
  st_openai_key = st.sidebar.text_input(
152
  "OPENAI_KEY",
153
+ value=openai_key,
154
  help="See https://help.openai.com/en/articles/4936850-where-do-i-find-my-secret-api-key for more info.",
155
  type="password",
156
  )
 
159
  value=os.getenv("OPENAI_MODEL", default="text-davinci-003"),
160
  help="See more here: https://platform.openai.com/docs/models/",
161
  )
162
+ return (
163
+ openai_api_type,
164
+ st_openai_api_base,
165
+ st_deployment_id,
166
+ st_openai_version,
167
+ st_openai_key,
168
+ st_openai_model,
169
+ )
170
+
171
+
172
+ if st_operator == "mask":
173
+ st_number_of_chars = st.sidebar.number_input(
174
+ "number of chars", value=st_number_of_chars, min_value=0, max_value=100
175
+ )
176
+ st_mask_char = st.sidebar.text_input(
177
+ "Mask character", value=st_mask_char, max_chars=1
178
+ )
179
+ elif st_operator == "encrypt":
180
+ st_encrypt_key = st.sidebar.text_input("AES key", value=st_encrypt_key)
181
+ elif st_operator == "synthesize":
182
+ (
183
+ openai_api_type,
184
+ st_openai_api_base,
185
+ st_deployment_id,
186
+ st_openai_version,
187
+ st_openai_key,
188
+ st_openai_model,
189
+ ) = set_up_openai_synthesis()
190
 
191
  open_ai_params = OpenAIParams(
192
  openai_key=st_openai_key,
193
  model=st_openai_model,
194
  api_base=st_openai_api_base,
195
+ deployment_id=st_deployment_id,
196
  api_version=st_openai_version,
197
  api_type=openai_api_type,
198
  )
 
240
  \n\n[Code](https://aka.ms/presidio) |
241
  [Tutorial](https://microsoft.github.io/presidio/tutorial/) |
242
  [Installation](https://microsoft.github.io/presidio/installation/) |
243
+ [FAQ](https://microsoft.github.io/presidio/faq/) |
244
+ [Feedback](https://forms.office.com/r/9ufyYjfDaY) |"""
245
  )
246
 
247
  st.info(
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
- presidio-analyzer
 
2
  presidio-anonymizer
3
  streamlit
4
  streamlit-tags
@@ -6,8 +7,6 @@ pandas
6
  python-dotenv
7
  st-annotated-text
8
  torch
9
- transformers
10
  flair
11
  openai
12
- spacy
13
  azure-ai-textanalytics
 
1
+ presidio-analyzer[transformers]
2
+ presidio-analyzer[stanza]
3
  presidio-anonymizer
4
  streamlit
5
  streamlit-tags
 
7
  python-dotenv
8
  st-annotated-text
9
  torch
 
10
  flair
11
  openai
 
12
  azure-ai-textanalytics
test_streamlit.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from presidio_helpers import analyzer_engine, analyze, anonymize
2
+
3
+
4
def test_streamlit_logic():
    """Smoke-test the demo pipeline: build the analyzer, analyze, anonymize.

    Uses the ``stanza`` package with the ``en`` model and leaves the Azure AI
    Language key/endpoint unset, reads the bundled ``demo_text.txt`` and
    checks that the anonymized output is not an empty string.
    """
    st_model = "en"
    st_model_package = "stanza"
    # Azure AI Language credentials are not supplied for this run.
    st_ta_key = None
    st_ta_endpoint = None

    analyzer_params = (st_model_package, st_model, st_ta_key, st_ta_endpoint)

    # Read the default demo text. The encoding is spelled out so decoding does
    # not depend on the platform's locale default.
    with open("demo_text.txt", encoding="utf-8") as f:
        st_text = f.read()

    # Instantiate the AnalyzerEngine before analyzing
    # (presumably cached inside presidio_helpers -- confirm there).
    analyzer_engine(*analyzer_params)

    # Analyze
    st_analyze_results = analyze(
        *analyzer_params,
        text=st_text,
        entities="All",
        language="en",
        score_threshold=0.35,
        return_decision_process=True,
        allow_list=[],
        deny_list=[],
    )

    # Anonymize
    st_anonymize_results = anonymize(
        text=st_text,
        operator="replace",
        mask_char=None,
        number_of_chars=None,
        encrypt_key=None,
        analyze_results=st_analyze_results,
    )

    assert st_anonymize_results.text != ""