jfrery-zama committed
Commit 1dfccc3 • 1 Parent(s): 7552fa2

update representation with roberta + new fast model

deployment/client.zip CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:147cf22982f4eb5198ad222bc290c0b3c36d5e15969294eb6a0a6f203c692c78
-size 129874
+oid sha256:81c8de4328853bc4b3df668ea1a174b6ed4d9d086c1a2cf544e3db639ac43b92
+size 30438
deployment/server.zip CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d24632c72f335b8bff44a62fde127eaf94e6eb21c9b2343c74fdcb2abdbf5747
-size 5637
+oid sha256:52100230351a158351fdede68d1bca31e1473e1f9f8abe6a71a0a95c7191b18e
+size 6333
deployment/versions.json CHANGED
@@ -1 +1 @@
-{"concrete-python": "2.5.1", "concrete-ml": "1.5.0-rc0", "python": "3.8.16"}
+{"concrete-python": "2.5.1", "concrete-ml": "1.5.0-rc0", "python": "3.10.12"}
fhe_anonymizer.py CHANGED
@@ -5,6 +5,8 @@ from pathlib import Path
 from concrete.ml.common.serialization.loaders import load
 import uuid
 import json
+from transformers import AutoTokenizer, AutoModel
+from utils_demo import get_batch_text_representation
 
 base_dir = Path(__file__).parent
 
@@ -12,9 +14,10 @@ base_dir = Path(__file__).parent
 class FHEAnonymizer:
     def __init__(self, punctuation_list=".,!?:;"):
 
-        self.embeddings_model = gensim.models.FastText.load(
-            str(base_dir / "models/without_pronoun_embedded_model.model")
-        )
+        # Load tokenizer and model, move model to the selected device
+        self.tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
+        self.embeddings_model = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
+
         self.punctuation_list = punctuation_list
         with open(base_dir / "models/without_pronoun_cml_xgboost.model", "r") as model_file:
             self.fhe_ner_detection = load(file=model_file)
@@ -49,7 +52,8 @@ class FHEAnonymizer:
                 continue
 
             # Prediction for each word
-            x = self.embeddings_model.wv[token][None]
+            x = get_batch_text_representation([token], self.embeddings_model, self.tokenizer)
+
             # prediction_proba = self.fhe_ner_detection.predict_proba(x)
             prediction_proba = self.fhe_inference(x)
             probability = prediction_proba[0][1]
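
The embedding path introduced above can be exercised on its own as a sanity check. A minimal sketch, assuming the checkpoint and helper added in this commit (the sample token is illustrative, not from the repo):

from transformers import AutoTokenizer, AutoModel
from utils_demo import get_batch_text_representation

tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
model = AutoModel.from_pretrained("obi/deid_roberta_i2b2")

# One mean-pooled vector per input string, shape (1, hidden_size),
# the same 2D layout the previous `self.embeddings_model.wv[token][None]`
# FastText lookup fed to the downstream XGBoost classifier.
x = get_batch_text_representation(["Alice"], model, tokenizer)
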
models/cml_xgboost.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:431175c3c2bd7591ebfffa3ea45b1096dda5ba7588291252994f9be31db35534
-size 6625266
+oid sha256:ea897cbdde4c99e439b788615a664083cfd57c0b3259407c0830a70e5fb45b05
+size 82842
utils_demo.py CHANGED
@@ -1,24 +1,22 @@
-import uuid
-
-def process_tokens(tokens, inverse_uuid_map=None, uuid_map=None, embeddings_model=None, fhe_ner_detection=None, client=None):
-    """Processes tokens based on the provided parameters for either deanonymizing, anonymizing or default processing."""
-    processed_tokens = []
-    for token in tokens:
-        if not token.strip() or not re.match(r"\w+", token):  # Directly append non-word tokens or whitespace
-            processed_tokens.append(token)
-            continue
-        if inverse_uuid_map is not None:  # For deanonymizing response
-            processed_tokens.append(inverse_uuid_map.get(token, token))
-        elif uuid_map is not None and embeddings_model is not None and fhe_ner_detection is not None and client is not None:  # For FHEAnonymizer call
-            x = embeddings_model.wv[token][None]
-            prediction_proba = fhe_ner_detection.predict_proba(x)
-            probability = prediction_proba[0][1]
-            if probability >= 0.5:
-                tmp_uuid = uuid_map.get(token, str(uuid.uuid4())[:8])
-                processed_tokens.append(tmp_uuid)
-                uuid_map[token] = tmp_uuid
-            else:
-                processed_tokens.append(token)
-        else:
-            processed_tokens.append(token)
-    return ''.join(processed_tokens)
+import torch
+import numpy as np
+import random
+
+def get_batch_text_representation(texts, model, tokenizer, batch_size=1):
+    """
+    Get mean-pooled representations of given texts in batches.
+    """
+    mean_pooled_batch = []
+    for i in range(0, len(texts), batch_size):
+        batch_texts = texts[i:i+batch_size]
+        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True)
+        with torch.no_grad():
+            outputs = model(**inputs, output_hidden_states=False)
+        last_hidden_states = outputs.last_hidden_state
+        input_mask_expanded = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden_states.size()).float()
+        sum_embeddings = torch.sum(last_hidden_states * input_mask_expanded, 1)
+        sum_mask = input_mask_expanded.sum(1)
+        mean_pooled = sum_embeddings / sum_mask
+        mean_pooled_batch.extend(mean_pooled.cpu().detach().numpy())
+    return np.array(mean_pooled_batch)
+
22