Zamanonymize3

Sleeping

File size: 2,741 Bytes

646bd9e
 
 
 
 
df6182e
 
646bd9e
 
 
2b591f4
646bd9e
 
 
2b591f4
628fe8f
2b591f4
646bd9e
628fe8f
646bd9e
 
df6182e
 
 
646bd9e
 
 
 
 
 
 
 
 
 
 
 
 
df6182e
 
 
 
 
 
 
 
 
 
 
 
646bd9e
 
df6182e
628fe8f
 
2b591f4
646bd9e
df6182e
 
 
 
 
 
 
 
646bd9e
df6182e

import gensim
import re
from concrete.ml.deployment import FHEModelClient, FHEModelServer
from pathlib import Path
from concrete.ml.common.serialization.loaders import load
import uuid
import json

# Directory containing this file; the model, mapping and deployment paths
# used by FHEAnonymizer are resolved relative to it.
base_dir = Path(__file__).parent


class FHEAnonymizer:
    """Replace words flagged by an FHE-evaluated classifier with short IDs.

    Each word of the input text is embedded with a FastText model, scored
    through the Concrete ML client/server pair, and, when the score reaches
    the threshold, substituted by an 8-character identifier. Identifiers are
    looked up in (and added to) ``uuid_map`` so that the same word always
    maps to the same identifier for the lifetime of the instance.
    """

    # A "word" may contain '.', '/', '-' and '@' internally (e-mails, paths,
    # dates); a "sep" is a run of whitespace and common punctuation. Anything
    # matched by neither group is copied through untouched by __call__.
    _TOKEN_RE = re.compile(
        r"(?P<word>\b[\w\.\/\-@]+\b)|(?P<sep>[\s,.!?;:'\"-]+)"
    )

    def __init__(self, punctuation_list=".,!?:;", threshold=0.5):
        """Load the models, the word-to-ID mapping and the FHE key material.

        Args:
            punctuation_list: Stored on the instance; not read by any method
                visible in this class.
            threshold: Minimum score (inclusive) at which a word is replaced.
                Defaults to 0.5, the value that was previously hard-coded.
        """
        self.embeddings_model = gensim.models.FastText.load(
            str(base_dir / "models/without_pronoun_embedded_model.model")
        )
        self.punctuation_list = punctuation_list
        self.threshold = threshold

        # Clear-text model; kept on the instance although inference below
        # goes through the FHE client/server instead.
        with open(
            base_dir / "models/without_pronoun_cml_xgboost.model",
            "r",
            encoding="utf-8",
        ) as model_file:
            self.fhe_ner_detection = load(file=model_file)

        # Pre-existing word -> identifier mapping; extended in memory by
        # __call__ (it is not written back to disk here).
        with open(
            base_dir / "original_document_uuid_mapping.json",
            "r",
            encoding="utf-8",
        ) as file:
            self.uuid_map = json.load(file)

        path_to_model = (base_dir / "deployment").resolve()
        self.client = FHEModelClient(path_to_model)
        self.server = FHEModelServer(path_to_model)
        self.client.generate_private_and_evaluation_keys()
        self.evaluation_key = self.client.get_serialized_evaluation_keys()

    def fhe_inference(self, x):
        """Score ``x`` through the client -> server -> client round trip.

        The input is quantized, encrypted and serialized by the client, run
        by the server with the evaluation key, then deserialized, decrypted
        and dequantized by the client.

        Args:
            x: Model input as produced in ``__call__`` (an embedding with a
                leading axis added).

        Returns:
            The decrypted model output; ``__call__`` reads ``[0][1]`` from it.
        """
        enc_x = self.client.quantize_encrypt_serialize(x)
        enc_y = self.server.run(enc_x, self.evaluation_key)
        return self.client.deserialize_decrypt_dequantize(enc_y)

    def __call__(self, text: str):
        """Anonymize ``text``.

        Args:
            text: The text to process.

        Returns:
            A ``(anonymized_text, identified)`` tuple where ``identified`` is
            a list of ``(word, score)`` pairs for every replaced word, in
            order of appearance. All characters that are not part of a
            replaced word are preserved exactly.
        """
        identified_words_with_prob = []
        processed_tokens = []
        cursor = 0

        for match in self._TOKEN_RE.finditer(text):
            # Characters matched by neither group (brackets, symbols, ...)
            # must be kept, otherwise the output silently loses text.
            if match.start() > cursor:
                processed_tokens.append(text[cursor:match.start()])
            cursor = match.end()
            token = match.group()

            # Whitespace / punctuation runs are passed through unchanged.
            if match.lastgroup != "word":
                processed_tokens.append(token)
                continue

            # [None] adds a leading axis so a single word forms a batch of one.
            x = self.embeddings_model.wv[token][None]
            prediction_proba = self.fhe_inference(x)
            probability = prediction_proba[0][1]

            if probability >= self.threshold:
                identified_words_with_prob.append((token, probability))
                # Reuse the known identifier; only mint a new one on a miss.
                replacement = self.uuid_map.get(token)
                if replacement is None:
                    replacement = str(uuid.uuid4())[:8]
                    self.uuid_map[token] = replacement
                processed_tokens.append(replacement)
            else:
                processed_tokens.append(token)

        # Trailing characters after the last match.
        processed_tokens.append(text[cursor:])

        return "".join(processed_tokens), identified_words_with_prob