jfrery-zama
committed on
Commit
•
1dfccc3
1
Parent(s):
7552fa2
update representation with roberta + new fast model
Browse files
- deployment/client.zip +2 -2
- deployment/server.zip +2 -2
- deployment/versions.json +1 -1
- fhe_anonymizer.py +8 -4
- models/cml_xgboost.model +2 -2
- utils_demo.py +21 -23
deployment/client.zip
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:81c8de4328853bc4b3df668ea1a174b6ed4d9d086c1a2cf544e3db639ac43b92
|
3 |
+
size 30438
|
deployment/server.zip
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:52100230351a158351fdede68d1bca31e1473e1f9f8abe6a71a0a95c7191b18e
|
3 |
+
size 6333
|
deployment/versions.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{"concrete-python": "2.5.1", "concrete-ml": "1.5.0-rc0", "python": "3.
|
|
|
1 |
+
{"concrete-python": "2.5.1", "concrete-ml": "1.5.0-rc0", "python": "3.10.12"}
|
fhe_anonymizer.py
CHANGED
@@ -5,6 +5,8 @@ from pathlib import Path
|
|
5 |
from concrete.ml.common.serialization.loaders import load
|
6 |
import uuid
|
7 |
import json
|
|
|
|
|
8 |
|
9 |
base_dir = Path(__file__).parent
|
10 |
|
@@ -12,9 +14,10 @@ base_dir = Path(__file__).parent
|
|
12 |
class FHEAnonymizer:
|
13 |
def __init__(self, punctuation_list=".,!?:;"):
|
14 |
|
15 |
-
|
16 |
-
|
17 |
-
)
|
|
|
18 |
self.punctuation_list = punctuation_list
|
19 |
with open(base_dir / "models/without_pronoun_cml_xgboost.model", "r") as model_file:
|
20 |
self.fhe_ner_detection = load(file=model_file)
|
@@ -49,7 +52,8 @@ class FHEAnonymizer:
|
|
49 |
continue
|
50 |
|
51 |
# Prediction for each word
|
52 |
-
x = self.embeddings_model.
|
|
|
53 |
# prediction_proba = self.fhe_ner_detection.predict_proba(x)
|
54 |
prediction_proba = self.fhe_inference(x)
|
55 |
probability = prediction_proba[0][1]
|
|
|
5 |
from concrete.ml.common.serialization.loaders import load
|
6 |
import uuid
|
7 |
import json
|
8 |
+
from transformers import AutoTokenizer, AutoModel
|
9 |
+
from utils_demo import get_batch_text_representation
|
10 |
|
11 |
base_dir = Path(__file__).parent
|
12 |
|
|
|
14 |
class FHEAnonymizer:
|
15 |
def __init__(self, punctuation_list=".,!?:;"):
|
16 |
|
17 |
+
# Load tokenizer and model, move model to the selected device
|
18 |
+
self.tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
|
19 |
+
self.embeddings_model = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
|
20 |
+
|
21 |
self.punctuation_list = punctuation_list
|
22 |
with open(base_dir / "models/without_pronoun_cml_xgboost.model", "r") as model_file:
|
23 |
self.fhe_ner_detection = load(file=model_file)
|
|
|
52 |
continue
|
53 |
|
54 |
# Prediction for each word
|
55 |
+
x = get_batch_text_representation([token], self.embeddings_model, self.tokenizer)
|
56 |
+
|
57 |
# prediction_proba = self.fhe_ner_detection.predict_proba(x)
|
58 |
prediction_proba = self.fhe_inference(x)
|
59 |
probability = prediction_proba[0][1]
|
models/cml_xgboost.model
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ea897cbdde4c99e439b788615a664083cfd57c0b3259407c0830a70e5fb45b05
|
3 |
+
size 82842
|
utils_demo.py
CHANGED
@@ -1,24 +1,22 @@
|
|
1 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
-
def process_tokens(tokens, inverse_uuid_map=None, uuid_map=None, embeddings_model=None, fhe_ner_detection=None, client=None):
|
4 |
-
"""Processes tokens based on the provided parameters for either deanonymizing, anonymizing or default processing."""
|
5 |
-
processed_tokens = []
|
6 |
-
for token in tokens:
|
7 |
-
if not token.strip() or not re.match(r"\w+", token): # Directly append non-word tokens or whitespace
|
8 |
-
processed_tokens.append(token)
|
9 |
-
continue
|
10 |
-
if inverse_uuid_map is not None: # For deanonymizing response
|
11 |
-
processed_tokens.append(inverse_uuid_map.get(token, token))
|
12 |
-
elif uuid_map is not None and embeddings_model is not None and fhe_ner_detection is not None and client is not None: # For FHEAnonymizer call
|
13 |
-
x = embeddings_model.wv[token][None]
|
14 |
-
prediction_proba = fhe_ner_detection.predict_proba(x)
|
15 |
-
probability = prediction_proba[0][1]
|
16 |
-
if probability >= 0.5:
|
17 |
-
tmp_uuid = uuid_map.get(token, str(uuid.uuid4())[:8])
|
18 |
-
processed_tokens.append(tmp_uuid)
|
19 |
-
uuid_map[token] = tmp_uuid
|
20 |
-
else:
|
21 |
-
processed_tokens.append(token)
|
22 |
-
else:
|
23 |
-
processed_tokens.append(token)
|
24 |
-
return ''.join(processed_tokens)
|
|
|
1 |
+
import torch
|
2 |
+
import numpy as np
|
3 |
+
import random
|
4 |
+
|
5 |
+
def get_batch_text_representation(texts, model, tokenizer, batch_size=1):
|
6 |
+
"""
|
7 |
+
Get mean-pooled representations of given texts in batches.
|
8 |
+
"""
|
9 |
+
mean_pooled_batch = []
|
10 |
+
for i in range(0, len(texts), batch_size):
|
11 |
+
batch_texts = texts[i:i+batch_size]
|
12 |
+
inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True)
|
13 |
+
with torch.no_grad():
|
14 |
+
outputs = model(**inputs, output_hidden_states=False)
|
15 |
+
last_hidden_states = outputs.last_hidden_state
|
16 |
+
input_mask_expanded = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden_states.size()).float()
|
17 |
+
sum_embeddings = torch.sum(last_hidden_states * input_mask_expanded, 1)
|
18 |
+
sum_mask = input_mask_expanded.sum(1)
|
19 |
+
mean_pooled = sum_embeddings / sum_mask
|
20 |
+
mean_pooled_batch.extend(mean_pooled.cpu().detach().numpy())
|
21 |
+
return np.array(mean_pooled_batch)
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|