Spaces:
Sleeping
Sleeping
Add new transformers model for French + update entities
Browse files- README.md +3 -3
- __pycache__/presidio_helpers.cpython-310.pyc +0 -0
- __pycache__/presidio_nlp_engine_config.cpython-310.pyc +0 -0
- __pycache__/transformers_class.cpython-310.pyc +0 -0
- app.py +10 -4
- presidio_helpers.py +2 -1
- presidio_nlp_engine_config.py +94 -105
- recognizers.yaml +11 -11
- requirements.txt +0 -2
- transformers_class.py +52 -0
README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
---
|
2 |
-
title: Anonymizer
|
3 |
-
emoji:
|
4 |
colorFrom: gray
|
5 |
colorTo: gray
|
6 |
sdk: streamlit
|
@@ -9,4 +9,4 @@ app_file: app.py
|
|
9 |
pinned: false
|
10 |
---
|
11 |
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: Aliae Anonymizer
|
3 |
+
emoji: 😻
|
4 |
colorFrom: gray
|
5 |
colorTo: gray
|
6 |
sdk: streamlit
|
|
|
9 |
pinned: false
|
10 |
---
|
11 |
|
12 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
__pycache__/presidio_helpers.cpython-310.pyc
CHANGED
Binary files a/__pycache__/presidio_helpers.cpython-310.pyc and b/__pycache__/presidio_helpers.cpython-310.pyc differ
|
|
__pycache__/presidio_nlp_engine_config.cpython-310.pyc
CHANGED
Binary files a/__pycache__/presidio_nlp_engine_config.cpython-310.pyc and b/__pycache__/presidio_nlp_engine_config.cpython-310.pyc differ
|
|
__pycache__/transformers_class.cpython-310.pyc
ADDED
Binary file (1.81 kB). View file
|
|
app.py
CHANGED
@@ -56,7 +56,7 @@ st_ta_key = st_ta_endpoint = ""
|
|
56 |
|
57 |
model_list = [
|
58 |
"spaCy/en_core_web_lg",
|
59 |
-
"spaCy/
|
60 |
]
|
61 |
# "flair/ner-english-large",
|
62 |
#
|
@@ -78,7 +78,7 @@ lang = st.sidebar.selectbox(
|
|
78 |
|
79 |
# Extract model package.
|
80 |
# st_model_package = st_model.split("/")[0]
|
81 |
-
|
82 |
|
83 |
# # Remove package prefix (if needed)
|
84 |
# st_model = (
|
@@ -87,8 +87,14 @@ st_model_package = 'spaCy'
|
|
87 |
# else "/".join(st_model.split("/")[1:])
|
88 |
# )
|
89 |
st_model = 'en_core_web_lg'
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
|
93 |
# if st_model == "Other":
|
94 |
# st_model_package = st.sidebar.selectbox(
|
|
|
56 |
|
57 |
model_list = [
|
58 |
"spaCy/en_core_web_lg",
|
59 |
+
"spaCy/fr_core_news_lg",
|
60 |
]
|
61 |
# "flair/ner-english-large",
|
62 |
#
|
|
|
78 |
|
79 |
# Extract model package.
|
80 |
# st_model_package = st_model.split("/")[0]
|
81 |
+
|
82 |
|
83 |
# # Remove package prefix (if needed)
|
84 |
# st_model = (
|
|
|
87 |
# else "/".join(st_model.split("/")[1:])
|
88 |
# )
|
89 |
st_model = 'en_core_web_lg'
|
90 |
+
st_model_package = "spaCy"
|
91 |
+
|
92 |
+
if lang =='en':
|
93 |
+
st_model_package = "spaCy"
|
94 |
+
st_model = 'en_core_web_lg'
|
95 |
+
elif lang == 'fr' :
|
96 |
+
st_model_package = "HuggingFace"
|
97 |
+
st_model = 'fr_core_news_lg'
|
98 |
|
99 |
# if st_model == "Other":
|
100 |
# st_model_package = st.sidebar.selectbox(
|
presidio_helpers.py
CHANGED
@@ -24,7 +24,7 @@ from presidio_anonymizer.entities import OperatorConfig
|
|
24 |
from presidio_nlp_engine_config import (
|
25 |
create_nlp_engine_with_spacy,
|
26 |
# create_nlp_engine_with_flair,
|
27 |
-
|
28 |
# create_nlp_engine_with_azure_text_analytics,
|
29 |
)
|
30 |
|
@@ -99,6 +99,7 @@ def get_supported_entities(
|
|
99 |
# model_family, model_path, ta_key, ta_endpoint
|
100 |
# ).get_supported_entities() + ["GENERIC_PII"]
|
101 |
return ["PERSON", "IBAN_CODE", "PHONE_NUMBER", "CREDIT_CARD", "CRYPTO", "DATE_TIME", "EMAIL_ADDRESS", "IP_ADDRESS", "NRP", "LOCATION", "URL", "FRENCH_SSN", "FRENCH_PASS", "FRENCH_NID"]
|
|
|
102 |
|
103 |
|
104 |
@st.cache_data
|
|
|
24 |
from presidio_nlp_engine_config import (
|
25 |
create_nlp_engine_with_spacy,
|
26 |
# create_nlp_engine_with_flair,
|
27 |
+
create_nlp_engine_with_transformers,
|
28 |
# create_nlp_engine_with_azure_text_analytics,
|
29 |
)
|
30 |
|
|
|
99 |
# model_family, model_path, ta_key, ta_endpoint
|
100 |
# ).get_supported_entities() + ["GENERIC_PII"]
|
101 |
return ["PERSON", "IBAN_CODE", "PHONE_NUMBER", "CREDIT_CARD", "CRYPTO", "DATE_TIME", "EMAIL_ADDRESS", "IP_ADDRESS", "NRP", "LOCATION", "URL", "FRENCH_SSN", "FRENCH_PASS", "FRENCH_NID"]
|
102 |
+
#
|
103 |
|
104 |
|
105 |
@st.cache_data
|
presidio_nlp_engine_config.py
CHANGED
@@ -3,6 +3,7 @@ import logging
|
|
3 |
import spacy
|
4 |
from presidio_analyzer import RecognizerRegistry
|
5 |
from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
|
|
|
6 |
|
7 |
logger = logging.getLogger("presidio-streamlit")
|
8 |
|
@@ -34,108 +35,96 @@ def create_nlp_engine_with_spacy(
|
|
34 |
return nlp_engine, registry
|
35 |
|
36 |
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
#
|
51 |
-
# )
|
52 |
-
#
|
53 |
-
#
|
54 |
-
#
|
55 |
-
#
|
56 |
-
#
|
57 |
-
#
|
58 |
-
#
|
59 |
-
#
|
60 |
-
#
|
61 |
-
#
|
62 |
-
|
63 |
-
#
|
64 |
-
#
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
#
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
#
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
#
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
#
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
#
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
#
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
#
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
#
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
#
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
#
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
# ta_recognizer = TextAnalyticsWrapper(ta_endpoint=ta_endpoint, ta_key=ta_key)
|
131 |
-
# nlp_configuration = {
|
132 |
-
# "nlp_engine_name": "spacy",
|
133 |
-
# "models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
|
134 |
-
# }
|
135 |
-
#
|
136 |
-
# nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()
|
137 |
-
#
|
138 |
-
# registry.add_recognizer(ta_recognizer)
|
139 |
-
# registry.remove_recognizer("SpacyRecognizer")
|
140 |
-
#
|
141 |
-
# return nlp_engine, registry
|
|
|
3 |
import spacy
|
4 |
from presidio_analyzer import RecognizerRegistry
|
5 |
from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
|
6 |
+
from transformers_class import TransformerRecognizer
|
7 |
|
8 |
logger = logging.getLogger("presidio-streamlit")
|
9 |
|
|
|
35 |
return nlp_engine, registry
|
36 |
|
37 |
|
38 |
+
def create_nlp_engine_with_transformers(
    model_path: str,
    transformers_model: str = "AliaeAI/camembert_anonymizer_production_v2",
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Instantiate an NlpEngine backed by a spaCy model plus a HuggingFace
    token-classification recognizer.

    The spaCy model supplies NlpArtifacts (tokens, POS, lemmas) while the
    transformers model performs the actual NER; the default SpacyRecognizer
    is removed so entity detection comes from the transformers model only.

    :param model_path: spaCy model name (e.g. ``fr_core_news_lg``); its
        language code is derived from the prefix before the first underscore.
    :param transformers_model: HuggingFace model id used for NER. Defaults
        to the production Aliae camembert model (previously hard-coded).
    :return: Tuple of (NlpEngine, RecognizerRegistry).
    """
    # Download the spaCy model on first use (no-op if already installed).
    if not spacy.util.is_package(model_path):
        spacy.cli.download(model_path)

    nlp_configuration = {
        "nlp_engine_name": "spacy",
        # spaCy model names follow "<lang>_<type>_<size>", so the language
        # code is the prefix before the first underscore.
        "models": [{"lang_code": model_path.split("_")[0], "model_name": model_path}],
    }
    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

    registry = RecognizerRegistry()
    registry = load_predefined_recognizers(registry)

    # Map the HF model's entity groups onto Presidio entity names.
    mapping_labels = {"PER": "PERSON", "LOC": "LOCATION"}
    transformers_recognizer = TransformerRecognizer(transformers_model, mapping_labels)

    registry.add_recognizer(transformers_recognizer)
    # NER is delegated to the transformers model; drop spaCy's own recognizer
    # to avoid duplicate/conflicting results.
    registry.remove_recognizer("SpacyRecognizer")

    return nlp_engine, registry
|
88 |
+
|
89 |
+
|
90 |
+
|
91 |
+
from presidio_analyzer.predefined_recognizers import (
    CreditCardRecognizer,
    CryptoRecognizer,
    DateRecognizer,
    EmailRecognizer,
    IbanRecognizer,
    IpRecognizer,
    PhoneRecognizer,
    UrlRecognizer,
)
import phonenumbers


def load_predefined_recognizers(registry, lang="fr"):
    """
    Register Presidio's built-in pattern recognizers for the given language,
    each with French-oriented context words, plus custom recognizers loaded
    from ``recognizers.yaml``.

    :param registry: RecognizerRegistry to populate (mutated in place).
    :param lang: Language code the recognizers are registered under.
    :return: The same registry, for call-chaining.
    """
    # (recognizer class, extra constructor kwargs). Context words raise the
    # confidence score when they appear near a match.
    predefined = [
        (PhoneRecognizer, {"supported_regions": phonenumbers.SUPPORTED_REGIONS,
                           "context": ["téléphone"]}),
        (EmailRecognizer, {"context": ["email", "mail", "e-mail"]}),
        (CreditCardRecognizer, {"context": ["crédit", "carte", "carte de crédit"]}),
        (CryptoRecognizer, {"context": ["crypto"]}),
        (DateRecognizer, {"context": ["mois", "date", "jour", "année"]}),
        (IpRecognizer, {"context": ["IP", "ip"]}),
        (IbanRecognizer, {"context": ["IBAN", "iban", "bancaire", "compte"]}),
        (UrlRecognizer, {"context": ["site", "web"]}),
    ]
    for recognizer_cls, kwargs in predefined:
        registry.add_recognizer(recognizer_cls(supported_language=lang, **kwargs))

    # Custom regex-based recognizers (e.g. FRENCH_NID) live in YAML.
    registry.add_recognizers_from_yaml("recognizers.yaml")

    return registry
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
recognizers.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1 |
recognizers:
|
2 |
-
-
|
3 |
-
name: "FRENCH_NID"
|
4 |
-
supported_language: "fr"
|
5 |
-
patterns:
|
6 |
-
-
|
7 |
-
name: "FRENCH_NID"
|
8 |
-
regex: "[0-9]{12}|([A-Z]|[0-9]){9}"
|
9 |
-
score: 0.5
|
10 |
-
context:
|
11 |
-
- national
|
12 |
-
supported_entity: "FRENCH_NID"
|
13 |
-
|
14 |
name: "FRENCH_NID"
|
15 |
supported_language: "en"
|
|
|
1 |
recognizers:
|
2 |
+
# -
|
3 |
+
# name: "FRENCH_NID"
|
4 |
+
# supported_language: "fr"
|
5 |
+
# patterns:
|
6 |
+
# -
|
7 |
+
# name: "FRENCH_NID"
|
8 |
+
# regex: "[0-9]{12}|([A-Z]|[0-9]){9}"
|
9 |
+
# score: 0.5
|
10 |
+
# context:
|
11 |
+
# - national
|
12 |
+
# supported_entity: "FRENCH_NID"
|
13 |
-
|
14 |
name: "FRENCH_NID"
|
15 |
supported_language: "en"
|
requirements.txt
CHANGED
@@ -7,7 +7,5 @@ python-dotenv
|
|
7 |
st-annotated-text
|
8 |
torch
|
9 |
transformers
|
10 |
-
flair
|
11 |
-
openai
|
12 |
spacy
|
13 |
azure-ai-textanalytics
|
|
|
7 |
st-annotated-text
|
8 |
torch
|
9 |
transformers
|
|
|
|
|
10 |
spacy
|
11 |
azure-ai-textanalytics
|
transformers_class.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import pipeline
from presidio_analyzer import (
    RecognizerResult,
    EntityRecognizer,
    AnalysisExplanation,
)
from presidio_analyzer.nlp_engine import NlpEngine, SpacyNlpEngine, NlpArtifacts


class TransformerRecognizer(EntityRecognizer):
    """Presidio EntityRecognizer backed by a HuggingFace token-classification pipeline."""

    def __init__(
        self,
        model_id_or_path,
        mapping_labels,
        aggregation_strategy="simple",
        supported_language="fr",
        ignore_labels=None,
    ):
        """
        :param model_id_or_path: HF model id or local path for the pipeline.
        :param mapping_labels: Dict mapping model entity groups (e.g. "PER")
            to Presidio entity names (e.g. "PERSON").
        :param aggregation_strategy: How the pipeline merges sub-token
            predictions into word-level entities.
        :param supported_language: Language this recognizer is registered for.
        :param ignore_labels: Pipeline labels to drop; defaults to ["O", "MISC"].
        """
        # Use a None sentinel instead of a shared mutable default list.
        if ignore_labels is None:
            ignore_labels = ["O", "MISC"]

        # Initialize the transformers pipeline for the given model or path.
        self.pipeline = pipeline(
            "token-classification",
            model=model_id_or_path,
            aggregation_strategy=aggregation_strategy,
            ignore_labels=ignore_labels,
        )
        # Map model labels to Presidio entity names.
        self.label2presidio = mapping_labels

        # Register the Presidio-side entity names with the parent class.
        super().__init__(
            supported_entities=list(self.label2presidio.values()),
            supported_language=supported_language,
        )

    def load(self) -> None:
        """No loading is required; the pipeline is built in __init__."""
        pass

    def analyze(
        self, text: str, entities=None, nlp_artifacts: NlpArtifacts = None
    ):
        """
        Extract entities from *text* using the transformers pipeline.

        :param text: Input text to analyze.
        :param entities: Optional collection of Presidio entity names to keep;
            when None, every mapped entity is returned.
        :param nlp_artifacts: Unused here (artifacts come from the NlpEngine).
        :return: List of RecognizerResult.
        """
        results = []
        for prediction in self.pipeline(text):
            label = prediction["entity_group"]
            if label not in self.label2presidio:
                continue
            converted_entity = self.label2presidio[label]
            # BUG FIX: test `entities is None` first. The original wrote
            # `converted_entity in entities or entities is None`, which
            # evaluates the membership test before the None check and raises
            # TypeError whenever analyze() is called without an entity filter.
            if entities is None or converted_entity in entities:
                results.append(
                    RecognizerResult(
                        entity_type=converted_entity,
                        start=prediction["start"],
                        end=prediction["end"],
                        score=prediction["score"],
                    )
                )
        return results