File size: 2,899 Bytes
646bd9e 2b591f4 646bd9e 2b591f4 646bd9e 2b591f4 646bd9e 2b591f4 646bd9e 2b591f4 646bd9e 2b591f4 646bd9e 2b591f4 646bd9e 2b591f4 646bd9e 2b591f4 646bd9e 2b591f4 646bd9e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
import gensim
import re
from concrete.ml.deployment import FHEModelClient, FHEModelServer
from pathlib import Path
from concrete.ml.common.serialization.loaders import load
# Directory containing this module; the embedding model, the serialized
# classifier and the "deployment" folder are all resolved relative to it.
base_dir = Path(__file__).parent
class FHEAnonymizer:
    """Replace words flagged by a word-level classifier with ``<REMOVED>``.

    Each whitespace-separated word of the (preprocessed) input is embedded
    with a FastText model and scored by a classifier loaded through Concrete
    ML's ``load``. Words whose positive-class probability is at least 0.5 are
    masked. An FHE client/server pair is also set up and exposed through
    :meth:`fhe_inference`.
    """

    def __init__(self, punctuation_list=".,!?:;"):
        """Load the models and prepare the FHE client, server and keys.

        Args:
            punctuation_list: Characters that :meth:`preprocess_sentences`
                strips when they are not enclosed by word characters on both
                sides. An empty string disables that stripping step.
        """
        self.embeddings_model = gensim.models.FastText.load(
            str(base_dir / "embedded_model.model")
        )
        self.punctuation_list = punctuation_list

        # Explicit encoding so loading does not depend on the platform locale.
        with open(
            base_dir / "cml_xgboost.model", "r", encoding="utf-8"
        ) as model_file:
            self.fhe_ner_detection = load(file=model_file)

        path_to_model = (base_dir / "deployment").resolve()
        self.client = FHEModelClient(path_to_model)
        self.server = FHEModelServer(path_to_model)

        # Keys are generated once per instance; the serialized evaluation key
        # is what the server needs to run on encrypted inputs.
        self.client.generate_private_and_evaluation_keys()
        self.evaluation_key = self.client.get_serialized_evaluation_keys()

    def fhe_inference(self, x):
        """Run one client -> server -> client FHE round trip on ``x``.

        ``x`` is quantized, encrypted and serialized by the client, evaluated
        by the server with the stored evaluation key, then deserialized,
        decrypted and dequantized by the client.

        Args:
            x: Model input accepted by
                ``FHEModelClient.quantize_encrypt_serialize`` (presumably a
                2-D array of embeddings; confirm against the deployed model).

        Returns:
            The decrypted, dequantized model output.
        """
        enc_x = self.client.quantize_encrypt_serialize(x)
        enc_y = self.server.run(enc_x, self.evaluation_key)
        return self.client.deserialize_decrypt_dequantize(enc_y)

    def __call__(self, text: str):
        """Anonymize ``text`` word by word.

        Args:
            text: Raw input text.

        Returns:
            A ``(modified_text, identified_words_with_prob)`` tuple where
            ``modified_text`` is the preprocessed text with flagged words
            replaced by ``<REMOVED>`` (words joined by single spaces) and
            ``identified_words_with_prob`` is a list of
            ``(word, probability)`` pairs for the flagged words, in order.
        """
        text = self.preprocess_sentences(text)

        identified_words_with_prob = []  # tuples of (word, probability)
        new_text = []

        for word in text.split():
            # `[None]` adds a leading axis so the single embedding is passed
            # to the classifier as a batch of one.
            x = self.embeddings_model.wv[word][None]

            # NOTE: scoring uses the loaded model's `predict_proba` directly;
            # `self.fhe_inference(x)` is the encrypted alternative and is not
            # used here.
            probability = self.fhe_ner_detection.predict_proba(x)[0][1]

            if probability >= 0.5:
                identified_words_with_prob.append((word, probability))
                new_text.append("<REMOVED>")
            else:
                new_text.append(word)

        modified_text = " ".join(new_text)
        return modified_text, identified_words_with_prob

    def preprocess_sentences(self, sentence, verbose=False):
        """Normalize whitespace and strip loose punctuation from ``sentence``.

        Steps, in order:

        1. Replace runs of newlines with a single space.
        2. Collapse runs of spaces.
        3. Rewrite a trailing ``'s`` as `` s``.
        4. Drop one whitespace character before ``,.!?;:``.
        5. Remove every character of ``self.punctuation_list`` that is not
           surrounded by word characters on both sides (skipped when the
           list is empty).
        6. Repeat step 4.

        Args:
            sentence: Text to clean.
            verbose: When true, print the intermediate text after each step.

        Returns:
            The cleaned text.
        """

        def _trace(text):
            # Echo the intermediate result when requested, then pass it on.
            if verbose:
                print(text)
            return text

        sentence = _trace(re.sub(r"\n+", " ", sentence))
        sentence = _trace(re.sub(" +", " ", sentence))
        sentence = _trace(re.sub(r"'s\b", " s", sentence))
        sentence = _trace(re.sub(r"\s([,.!?;:])", r"\1", sentence))

        # With an empty list the pattern would degenerate to "[]|[]", which
        # the regex engine reads as a single class matching "]", "|" and "[";
        # skip the step instead of stripping unrelated characters.
        if self.punctuation_list:
            escaped = re.escape(self.punctuation_list)
            pattern = r"(?<!\w)[{0}]|[{0}](?!\w)".format(escaped)
            sentence = re.sub(pattern, "", sentence)
        _trace(sentence)

        sentence = _trace(re.sub(r"\s([,.!?;:])", r"\1", sentence))
        return sentence
|