File size: 4,870 Bytes
b494f67
 
 
 
 
79d722e
b494f67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79d722e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
from typing import Tuple
import logging
import spacy
from presidio_analyzer import RecognizerRegistry
from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider
from transformers_class import TransformerRecognizer

logger = logging.getLogger("presidio-streamlit")


def create_nlp_engine_with_spacy(
    model_path: str,
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Build a spaCy-backed NlpEngine together with a populated recognizer registry.

    The model is downloaded through the spaCy CLI when it is not already
    installed as a package. The engine's language code is the part of the
    model name that precedes the first underscore.

    :param model_path: spaCy model name (e.g. "fr_core_news_md").
    :return: Tuple of the created NlpEngine and a RecognizerRegistry holding
        the predefined recognizers for "fr" and "en" plus the recognizers
        declared in recognizers.yaml.
    """
    model_is_installed = spacy.util.is_package(model_path)
    if not model_is_installed:
        spacy.cli.download(model_path)

    language_code = model_path.split("_")[0]
    model_entry = {"lang_code": language_code, "model_name": model_path}
    provider = NlpEngineProvider(
        nlp_configuration={"nlp_engine_name": "spacy", "models": [model_entry]}
    )
    engine = provider.create_engine()

    recognizer_registry = RecognizerRegistry()
    recognizer_registry.load_predefined_recognizers(
        nlp_engine=engine, languages=["fr", "en"]
    )
    # Custom recognizers are read from a path relative to the working directory.
    recognizer_registry.add_recognizers_from_yaml("recognizers.yaml")

    return engine, recognizer_registry


def create_nlp_engine_with_transformers(
    model_path: str,
    transformers_model_name: str = "AliaeAI/camembert_anonymizer_production_v2",
) -> Tuple[NlpEngine, RecognizerRegistry]:
    """
    Instantiate a spaCy NlpEngine and a registry that uses a TransformerRecognizer.

    The transformers model is used here as a recognizer, not as an NlpEngine:
    the engine itself is still built from the spaCy model named by
    ``model_path``.

    :param model_path: spaCy model name (not a HuggingFace path). It is
        downloaded through the spaCy CLI when it is not installed, and the text
        before its first underscore is used as the engine's language code.
    :param transformers_model_name: Model identifier handed to
        TransformerRecognizer. Defaults to the model that was previously
        hard-coded in this function, so existing callers are unaffected.
    :return: Tuple of the created NlpEngine and a RecognizerRegistry containing
        the recognizers added by ``load_predefined_recognizers`` plus the
        TransformerRecognizer.
    """
    if not spacy.util.is_package(model_path):
        spacy.cli.download(model_path)

    nlp_configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": model_path.split("_")[0], "model_name": model_path}],
    }
    nlp_engine = NlpEngineProvider(nlp_configuration=nlp_configuration).create_engine()

    # NOTE(review): the helper is called with its default language ("fr"), so
    # the pattern recognizers are registered for French regardless of the
    # language code derived from model_path above - confirm this is intended.
    registry = load_predefined_recognizers(RecognizerRegistry())

    # Label mapping passed to TransformerRecognizer; presumably model labels ->
    # Presidio entity names (confirm against transformers_class).
    mapping_labels = {"PER": "PERSON", "LOC": "LOCATION"}
    # Other model ids noted by the original author:
    # "Jean-Baptiste/camembert-ner", "AliaeAI/camembert_anonymizer_production".
    transformers_recognizer = TransformerRecognizer(
        transformers_model_name, mapping_labels
    )

    registry.add_recognizer(transformers_recognizer)
    # Drop any recognizer registered under the name "SpacyRecognizer";
    # presumably so named entities come only from the transformer model.
    registry.remove_recognizer("SpacyRecognizer")

    return nlp_engine, registry



from presidio_analyzer.predefined_recognizers import PhoneRecognizer, EmailRecognizer, CreditCardRecognizer, CryptoRecognizer, DateRecognizer, IpRecognizer, IbanRecognizer, UrlRecognizer
import phonenumbers

def load_predefined_recognizers(registry, lang='fr'):
    """
    Register a fixed set of pattern recognizers, then those from recognizers.yaml.

    Each recognizer is created for the language ``lang`` with French-oriented
    context words and added to ``registry`` in the order listed below.

    :param registry: Registry object exposing ``add_recognizer`` and
        ``add_recognizers_from_yaml``; it is modified in place.
    :param lang: Language code passed as ``supported_language`` to every
        recognizer.
    :return: The same ``registry`` object that was passed in.
    """
    # (recognizer class, keyword arguments besides supported_language)
    recognizer_specs = (
        # phone number
        (
            PhoneRecognizer,
            {
                "supported_regions": phonenumbers.SUPPORTED_REGIONS,
                "context": ['téléphone'],
            },
        ),
        # email
        (EmailRecognizer, {"context": ["email", "mail", "e-mail"]}),
        # credit card
        (CreditCardRecognizer, {"context": ["crédit", "carte", "carte de crédit"]}),
        # crypto
        (CryptoRecognizer, {"context": ["crypto"]}),
        # date time
        (DateRecognizer, {"context": ["mois", "date", "jour", "année"]}),
        # ip address
        (IpRecognizer, {"context": ["IP", "ip"]}),
        # iban
        (IbanRecognizer, {"context": ["IBAN", "iban", "bancaire", "compte"]}),
        # URL
        (UrlRecognizer, {"context": ["site", "web"]}),
    )

    for recognizer_cls, extra_kwargs in recognizer_specs:
        registry.add_recognizer(
            recognizer_cls(supported_language=lang, **extra_kwargs)
        )

    # Custom recognizers come last, from a path relative to the working directory.
    registry.add_recognizers_from_yaml("recognizers.yaml")

    return registry