Spaces:
Sleeping
Sleeping
File size: 4,870 Bytes
b494f67 79d722e b494f67 79d722e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 |
from typing import Tuple
import logging
import spacy
from presidio_analyzer import RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
from transformers_class import TransformerRecognizer
logger = logging.getLogger("presidio-streamlit")
def create_nlp_engine_with_spacy(
    model_path: str,
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Build a spaCy-backed NlpEngine and a RecognizerRegistry to go with it.

    :param model_path: spaCy model name. The text before its first
        underscore is used as the model's language code.
    :return: Tuple of (nlp_engine, registry).
    """
    # Download the model when it is not available as an installed package.
    model_installed = spacy.util.is_package(model_path)
    if not model_installed:
        spacy.cli.download(model_path)

    language_code = model_path.split('_')[0]
    model_entry = {"lang_code": language_code, "model_name": model_path}
    provider = NlpEngineProvider(
        nlp_configuration={"nlp_engine_name": "spacy", "models": [model_entry]}
    )
    nlp_engine = provider.create_engine()

    # Predefined recognizers are requested for French and English regardless
    # of the model's language code; recognizers.yaml is resolved relative to
    # the current working directory.
    registry = RecognizerRegistry()
    registry.load_predefined_recognizers(nlp_engine=nlp_engine, languages=["fr", "en"])
    registry.add_recognizers_from_yaml("recognizers.yaml")

    return nlp_engine, registry
def create_nlp_engine_with_transformers(
    model_path: str,
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Build a spaCy NlpEngine plus a registry that uses a TransformerRecognizer.

    The transformers model acts as a recognizer only, not as the NlpEngine;
    the NlpEngine itself is still created from a spaCy model.

    :param model_path: spaCy model name used for the NlpEngine (it is checked
        and downloaded through spaCy). The text before its first underscore is
        used as the language code. The HuggingFace model is fixed inside this
        function and is not taken from this argument.
    :return: Tuple of (nlp_engine, registry).
    """
    # Download the spaCy model when it is not available as an installed package.
    model_installed = spacy.util.is_package(model_path)
    if not model_installed:
        spacy.cli.download(model_path)

    language_code = model_path.split('_')[0]
    model_entry = {"lang_code": language_code, "model_name": model_path}
    provider = NlpEngineProvider(
        nlp_configuration={"nlp_engine_name": "spacy", "models": [model_entry]}
    )
    nlp_engine = provider.create_engine()

    # Start from an empty registry and fill it with the module-level helper
    # (called with its default language).
    registry = load_predefined_recognizers(RecognizerRegistry())

    # Label mapping handed to TransformerRecognizer alongside the model name.
    mapping_labels = {"PER": "PERSON", 'LOC': 'LOCATION'}
    # Other models previously tried here: "Jean-Baptiste/camembert-ner",
    # "AliaeAI/camembert_anonymizer_production".
    model_name = "AliaeAI/camembert_anonymizer_production_v2"
    registry.add_recognizer(TransformerRecognizer(model_name, mapping_labels))

    # NOTE(review): presumably meant to keep spaCy NER results out in favour of
    # the transformer's; nothing in this function adds a "SpacyRecognizer"
    # before this call - confirm whether recognizers.yaml does.
    registry.remove_recognizer("SpacyRecognizer")

    return nlp_engine, registry
from presidio_analyzer.predefined_recognizers import PhoneRecognizer, EmailRecognizer, CreditCardRecognizer, CryptoRecognizer, DateRecognizer, IpRecognizer, IbanRecognizer, UrlRecognizer
import phonenumbers
def load_predefined_recognizers(registry, lang='fr'):
    """
    Add a fixed set of predefined recognizers, then the YAML-defined ones.

    :param registry: Registry that receives the recognizers; it is modified
        in place through add_recognizer / add_recognizers_from_yaml.
    :param lang: Language code passed as supported_language to every
        recognizer built here.
    :return: The same registry object that was passed in.
    """
    # (recognizer class, extra constructor keyword arguments), listed in
    # registration order. Context words are mostly French.
    recognizer_specs = (
        (
            PhoneRecognizer,
            {
                "supported_regions": phonenumbers.SUPPORTED_REGIONS,
                "context": ['téléphone'],
            },
        ),
        (EmailRecognizer, {"context": ["email", "mail", "e-mail"]}),
        (CreditCardRecognizer, {"context": ["crédit", "carte", "carte de crédit"]}),
        (CryptoRecognizer, {"context": ["crypto"]}),
        (DateRecognizer, {"context": ["mois", "date", "jour", "année"]}),
        (IpRecognizer, {"context": ["IP", "ip"]}),
        (IbanRecognizer, {"context": ["IBAN", "iban", "bancaire", "compte"]}),
        (UrlRecognizer, {"context": ["site", "web"]}),
    )

    # Build and register one recognizer at a time, in the listed order.
    for recognizer_cls, extra_kwargs in recognizer_specs:
        recognizer = recognizer_cls(supported_language=lang, **extra_kwargs)
        registry.add_recognizer(recognizer)

    # Custom recognizers; the path is resolved relative to the current
    # working directory.
    registry.add_recognizers_from_yaml("recognizers.yaml")
    return registry
|