jfrery-zama
committed on
Commit
•
d0b1031
1
Parent(s):
1dfccc3
update anonymize file in clear with roberta +update uuid map with query id
Browse files
- anonymize_file_clear.py +13 -7
- app.py +1 -1
- fhe_anonymizer.py +5 -5
anonymize_file_clear.py
CHANGED
@@ -5,15 +5,21 @@ import uuid
|
|
5 |
from pathlib import Path
|
6 |
import gensim
|
7 |
from concrete.ml.common.serialization.loaders import load
|
|
|
|
|
8 |
|
9 |
def load_models():
|
10 |
base_dir = Path(__file__).parent / "models"
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
13 |
fhe_ner_detection = load(file=model_file)
|
14 |
-
return embeddings_model, fhe_ner_detection
|
15 |
|
16 |
-
def anonymize_text(text, embeddings_model, fhe_ner_detection):
|
17 |
token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)"
|
18 |
tokens = re.findall(token_pattern, text)
|
19 |
uuid_map = {}
|
@@ -21,7 +27,7 @@ def anonymize_text(text, embeddings_model, fhe_ner_detection):
|
|
21 |
|
22 |
for token in tokens:
|
23 |
if token.strip() and re.match(r"\w+", token): # If the token is a word
|
24 |
-
x =
|
25 |
prediction_proba = fhe_ner_detection.predict_proba(x)
|
26 |
probability = prediction_proba[0][1]
|
27 |
prediction = probability >= 0.5
|
@@ -42,7 +48,7 @@ def main():
|
|
42 |
parser.add_argument("file_path", type=str, help="The path to the file to be processed.")
|
43 |
args = parser.parse_args()
|
44 |
|
45 |
-
embeddings_model, fhe_ner_detection = load_models()
|
46 |
|
47 |
# Read the input file
|
48 |
with open(args.file_path, 'r', encoding='utf-8') as file:
|
@@ -54,7 +60,7 @@ def main():
|
|
54 |
original_file.write(text)
|
55 |
|
56 |
# Anonymize the text
|
57 |
-
anonymized_text, uuid_map = anonymize_text(text, embeddings_model, fhe_ner_detection)
|
58 |
|
59 |
# Save the anonymized text to its specified file
|
60 |
anonymized_file_path = Path(__file__).parent / "files" / "anonymized_document.txt"
|
|
|
5 |
from pathlib import Path
|
6 |
import gensim
|
7 |
from concrete.ml.common.serialization.loaders import load
|
8 |
+
from transformers import AutoTokenizer, AutoModel
|
9 |
+
from utils_demo import get_batch_text_representation
|
10 |
|
11 |
def load_models():
|
12 |
base_dir = Path(__file__).parent / "models"
|
13 |
+
|
14 |
+
# Load tokenizer and model
|
15 |
+
tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
|
16 |
+
embeddings_model = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
|
17 |
+
|
18 |
+
with open(base_dir / "cml_logreg.model", "r") as model_file:
|
19 |
fhe_ner_detection = load(file=model_file)
|
20 |
+
return embeddings_model, tokenizer, fhe_ner_detection
|
21 |
|
22 |
+
def anonymize_text(text, embeddings_model, tokenizer, fhe_ner_detection):
|
23 |
token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)"
|
24 |
tokens = re.findall(token_pattern, text)
|
25 |
uuid_map = {}
|
|
|
27 |
|
28 |
for token in tokens:
|
29 |
if token.strip() and re.match(r"\w+", token): # If the token is a word
|
30 |
+
x = get_batch_text_representation([token], embeddings_model, tokenizer)
|
31 |
prediction_proba = fhe_ner_detection.predict_proba(x)
|
32 |
probability = prediction_proba[0][1]
|
33 |
prediction = probability >= 0.5
|
|
|
48 |
parser.add_argument("file_path", type=str, help="The path to the file to be processed.")
|
49 |
args = parser.parse_args()
|
50 |
|
51 |
+
embeddings_model, tokenizer, fhe_ner_detection = load_models()
|
52 |
|
53 |
# Read the input file
|
54 |
with open(args.file_path, 'r', encoding='utf-8') as file:
|
|
|
60 |
original_file.write(text)
|
61 |
|
62 |
# Anonymize the text
|
63 |
+
anonymized_text, uuid_map = anonymize_text(text, embeddings_model, tokenizer, fhe_ner_detection)
|
64 |
|
65 |
# Save the anonymized text to its specified file
|
66 |
anonymized_file_path = Path(__file__).parent / "files" / "anonymized_document.txt"
|
app.py
CHANGED
@@ -142,7 +142,7 @@ with demo:
|
|
142 |
|
143 |
examples_radio.change(lambda example_query: example_query, inputs=[examples_radio], outputs=[input_text])
|
144 |
|
145 |
-
anonymized_text_output = gr.Textbox(label="Anonymized Text with FHE", lines=1)
|
146 |
|
147 |
identified_words_output = gr.Dataframe(label="Identified Words", visible=False)
|
148 |
|
|
|
142 |
|
143 |
examples_radio.change(lambda example_query: example_query, inputs=[examples_radio], outputs=[input_text])
|
144 |
|
145 |
+
anonymized_text_output = gr.Textbox(label="Anonymized Text with FHE", lines=1, interactive=True)
|
146 |
|
147 |
identified_words_output = gr.Dataframe(label="Identified Words", visible=False)
|
148 |
|
fhe_anonymizer.py
CHANGED
@@ -14,13 +14,11 @@ base_dir = Path(__file__).parent
|
|
14 |
class FHEAnonymizer:
|
15 |
def __init__(self, punctuation_list=".,!?:;"):
|
16 |
|
17 |
-
# Load tokenizer and model
|
18 |
self.tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
|
19 |
self.embeddings_model = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
|
20 |
|
21 |
self.punctuation_list = punctuation_list
|
22 |
-
with open(base_dir / "models/without_pronoun_cml_xgboost.model", "r") as model_file:
|
23 |
-
self.fhe_ner_detection = load(file=model_file)
|
24 |
|
25 |
with open(base_dir / "original_document_uuid_mapping.json", 'r') as file:
|
26 |
self.uuid_map = json.load(file)
|
@@ -44,7 +42,6 @@ class FHEAnonymizer:
|
|
44 |
identified_words_with_prob = []
|
45 |
processed_tokens = []
|
46 |
|
47 |
-
print(tokens)
|
48 |
for token in tokens:
|
49 |
# Directly append non-word tokens or whitespace to processed_tokens
|
50 |
if not token.strip() or not re.match(r"\w+", token):
|
@@ -54,7 +51,6 @@ class FHEAnonymizer:
|
|
54 |
# Prediction for each word
|
55 |
x = get_batch_text_representation([token], self.embeddings_model, self.tokenizer)
|
56 |
|
57 |
-
# prediction_proba = self.fhe_ner_detection.predict_proba(x)
|
58 |
prediction_proba = self.fhe_inference(x)
|
59 |
probability = prediction_proba[0][1]
|
60 |
|
@@ -68,6 +64,10 @@ class FHEAnonymizer:
|
|
68 |
else:
|
69 |
processed_tokens.append(token)
|
70 |
|
|
|
|
|
|
|
|
|
71 |
# Reconstruct the sentence
|
72 |
reconstructed_sentence = ''.join(processed_tokens)
|
73 |
return reconstructed_sentence, identified_words_with_prob
|
|
|
14 |
class FHEAnonymizer:
|
15 |
def __init__(self, punctuation_list=".,!?:;"):
|
16 |
|
17 |
+
# Load tokenizer and model
|
18 |
self.tokenizer = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
|
19 |
self.embeddings_model = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
|
20 |
|
21 |
self.punctuation_list = punctuation_list
|
|
|
|
|
22 |
|
23 |
with open(base_dir / "original_document_uuid_mapping.json", 'r') as file:
|
24 |
self.uuid_map = json.load(file)
|
|
|
42 |
identified_words_with_prob = []
|
43 |
processed_tokens = []
|
44 |
|
|
|
45 |
for token in tokens:
|
46 |
# Directly append non-word tokens or whitespace to processed_tokens
|
47 |
if not token.strip() or not re.match(r"\w+", token):
|
|
|
51 |
# Prediction for each word
|
52 |
x = get_batch_text_representation([token], self.embeddings_model, self.tokenizer)
|
53 |
|
|
|
54 |
prediction_proba = self.fhe_inference(x)
|
55 |
probability = prediction_proba[0][1]
|
56 |
|
|
|
64 |
else:
|
65 |
processed_tokens.append(token)
|
66 |
|
67 |
+
# Update the UUID map with query.
|
68 |
+
with open(base_dir / "original_document_uuid_mapping.json", 'w') as file:
|
69 |
+
json.dump(self.uuid_map, file)
|
70 |
+
|
71 |
# Reconstruct the sentence
|
72 |
reconstructed_sentence = ''.join(processed_tokens)
|
73 |
return reconstructed_sentence, identified_words_with_prob
|