Commit 628fe8f • 1 Parent(s): be67bc2
jfrery-zama committed

use without pronoun model
Files changed:
- anonymize_file_clear.py  +18 -5
- app.py  +3 -3
- demo_text.txt  +1 -1
- deployment/client.zip  +2 -2
- deployment/server.zip  +2 -2
- fhe_anonymizer.py  +4 -3
- files/anonymized_document.txt  +6 -6
- files/question_demo.txt  +0 -10
- cml_xgboost.model → models/cml_xgboost.model  +0 -0
- embedded_model.model → models/embedded_model.model  +0 -0
- embedded_model.model.wv.vectors_ngrams.npy → models/embedded_model.model.wv.vectors_ngrams.npy  +0 -0
- models/without_pronoun_cml_xgboost.model  +3 -0
- models/without_pronoun_embedded_model.model  +3 -0
- models/without_pronoun_embedded_model.model.wv.vectors_ngrams.npy  +3 -0
- original_document_uuid_mapping.json  +33 -32
- utils_demo.py  +24 -0
anonymize_file_clear.py
CHANGED
@@ -7,9 +7,9 @@ import gensim
 from concrete.ml.common.serialization.loaders import load
 
 def load_models():
-    base_dir = Path(__file__).parent
-    embeddings_model = gensim.models.FastText.load(str(base_dir / "embedded_model.model"))
-    with open(base_dir / "cml_xgboost.model", "r") as model_file:
+    base_dir = Path(__file__).parent / "models"
+    embeddings_model = gensim.models.FastText.load(str(base_dir / "without_pronoun_embedded_model.model"))
+    with open(base_dir / "without_pronoun_cml_xgboost.model", "r") as model_file:
         fhe_ner_detection = load(file=model_file)
     return embeddings_model, fhe_ner_detection
 
@@ -34,7 +34,8 @@ def anonymize_text(text, embeddings_model, fhe_ner_detection):
     else:
         processed_tokens.append(token)  # Preserve punctuation and spaces as is
 
-    return uuid_map
+    anonymized_text = ''.join(processed_tokens)
+    return anonymized_text, uuid_map
 
 def main():
     parser = argparse.ArgumentParser(description="Anonymize named entities in a text file and save the mapping to a JSON file.")
@@ -47,14 +48,26 @@ def main():
     with open(args.file_path, 'r', encoding='utf-8') as file:
         text = file.read()
 
+    # Save the original text to its specified file
+    original_file_path = Path(__file__).parent / "files" / "original_document.txt"
+    with open(original_file_path, 'w', encoding='utf-8') as original_file:
+        original_file.write(text)
+
     # Anonymize the text
-    uuid_map = anonymize_text(text, embeddings_model, fhe_ner_detection)
+    anonymized_text, uuid_map = anonymize_text(text, embeddings_model, fhe_ner_detection)
+
+    # Save the anonymized text to its specified file
+    anonymized_file_path = Path(__file__).parent / "files" / "anonymized_document.txt"
+    with open(anonymized_file_path, 'w', encoding='utf-8') as anonymized_file:
+        anonymized_file.write(anonymized_text)
 
     # Save the UUID mapping to a JSON file
     mapping_path = Path(args.file_path).stem + "_uuid_mapping.json"
     with open(mapping_path, 'w', encoding='utf-8') as file:
         json.dump(uuid_map, file, indent=4, sort_keys=True)
 
+    print(f"Original text saved to {original_file_path}")
+    print(f"Anonymized text saved to {anonymized_file_path}")
     print(f"UUID mapping saved to {mapping_path}")
 
 if __name__ == "__main__":
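For readers following the change: a minimal sketch of what `anonymize_text` plausibly looks like after this commit, inferred from this diff and from `process_tokens` in `utils_demo.py` below. The `token_pattern` regex is an assumption, not taken from the repo:

# Hypothetical reconstruction of anonymize_text, based on this diff and on
# utils_demo.py; the token_pattern regex is an assumption.
import re
import uuid

def anonymize_text(text, embeddings_model, fhe_ner_detection):
    token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)"  # assumed tokenizer
    uuid_map = {}
    processed_tokens = []
    for token in re.findall(token_pattern, text):
        if not token.strip() or not re.match(r"\w+", token):
            processed_tokens.append(token)  # Preserve punctuation and spaces as is
            continue
        x = embeddings_model.wv[token][None]  # FastText embedding, shape (1, dim)
        probability = fhe_ner_detection.predict_proba(x)[0][1]
        if probability >= 0.5:  # token classified as a named entity
            tmp_uuid = uuid_map.get(token, str(uuid.uuid4())[:8])
            uuid_map[token] = tmp_uuid
            processed_tokens.append(tmp_uuid)
        else:
            processed_tokens.append(token)
    anonymized_text = ''.join(processed_tokens)
    return anonymized_text, uuid_map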
app.py
CHANGED
@@ -59,13 +59,13 @@ def query_chatgpt(anonymized_query):
     tokens = re.findall(token_pattern, anonymized_response)
     processed_tokens = []
 
-
+
     for token in tokens:
         # Directly append non-word tokens or whitespace to processed_tokens
         if not token.strip() or not re.match(r"\w+", token):
             processed_tokens.append(token)
             continue
-
+
         if token in inverse_uuid_map:
             processed_tokens.append(inverse_uuid_map[token])
         else:
@@ -136,7 +136,7 @@ with demo:
 
     anonymized_text_output = gr.Textbox(label="Anonymized Text with FHE", lines=13)
 
-    identified_words_output = gr.Dataframe(label="Identified Words", visible=False)
+    identified_words_output = gr.Dataframe(label="Identified Words", visible=True)
 
     submit_button = gr.Button("Anonymize with FHE")
 
demo_text.txt
CHANGED
@@ -1 +1 @@
-…
+who lives in Maine?
deployment/client.zip
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
-size …
+oid sha256:2e4798bf93e38a14f5f1aa15203bb093cf15c4dfee7edbd8e0f7767605755090
+size 129876
deployment/server.zip
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:…
-size …
+oid sha256:86fc185025bf0c84d50aef44f14308c0077db2701a9a835bf4e3b912a58cb9b0
+size 5639
fhe_anonymizer.py
CHANGED
@@ -13,10 +13,10 @@ class FHEAnonymizer:
     def __init__(self, punctuation_list=".,!?:;"):
 
         self.embeddings_model = gensim.models.FastText.load(
-            str(base_dir / "embedded_model.model")
+            str(base_dir / "models/without_pronoun_embedded_model.model")
         )
         self.punctuation_list = punctuation_list
-        with open(base_dir / "cml_xgboost.model", "r") as model_file:
+        with open(base_dir / "models/without_pronoun_cml_xgboost.model", "r") as model_file:
             self.fhe_ner_detection = load(file=model_file)
 
         with open(base_dir / "original_document_uuid_mapping.json", 'r') as file:
@@ -50,7 +50,8 @@ class FHEAnonymizer:
 
         # Prediction for each word
         x = self.embeddings_model.wv[token][None]
-        prediction_proba = self.fhe_ner_detection.predict_proba(x)
+        # prediction_proba = self.fhe_ner_detection.predict_proba(x)
+        prediction_proba = self.fhe_inference(x)
         probability = prediction_proba[0][1]
 
         if probability >= 0.5:
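`fhe_inference` itself is not part of this diff. Below is a sketch of the standard Concrete ML client/server flow it presumably wraps, with `deployment/client.zip` and `deployment/server.zip` (updated above) as the deployment artifacts; the class name, directory paths, and exact wiring are illustrative assumptions:

# Hypothetical sketch of the FHE inference path; assumes Concrete ML's
# standard deployment API rather than this app's verbatim code.
from concrete.ml.deployment import FHEModelClient, FHEModelServer

class FHEInferenceSketch:
    def __init__(self, deployment_dir="deployment", key_dir=".fhe_keys"):
        self.client = FHEModelClient(path_dir=deployment_dir, key_dir=key_dir)
        self.client.generate_private_and_evaluation_keys()
        self.evaluation_key = self.client.get_serialized_evaluation_keys()
        self.server = FHEModelServer(path_dir=deployment_dir)
        self.server.load()

    def fhe_inference(self, x):
        # Encrypt the token embedding, run the model homomorphically, decrypt.
        encrypted_x = self.client.quantize_encrypt_serialize(x)
        encrypted_y = self.server.run(encrypted_x, self.evaluation_key)
        return self.client.deserialize_decrypt_dequantize(encrypted_y)

Under this flow, the XGBoost classifier runs on encrypted embeddings, which is why the clear-model `predict_proba` call above is commented out in favor of `self.fhe_inference(x)`.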
files/anonymized_document.txt
CHANGED
@@ -1,10 +1,10 @@
-Hello, …
-…
+Hello, my name is 97dc4202 7ce27ecb and I live in aaf4b006.
+My credit card number is bfd59a59 and my c7184a17 516361a1 f8380bf5 is edf660df.
 
-On …
+On d615d819 3f343449 I visited b6394fb9 and 732237ac an email to a295c5d0, from the c591bc5d c83dd929.
 
-…
+My passport: c263d176 and my c402f998 number: a054c8c2 8fddc160.
 
-…
+This is a 08876c6f c5462fed 49b9cffb 3658044b Number: 2f075e1d . Can you please e51c8e1c the f1b9c36f on 4fd4e4c4 aa960526 148fea84?
 
-…
+a18f3dda's bcda6774 security number is 48f7c8a4. Her driver license? it is ab7ec0c3.
files/question_demo.txt
DELETED
@@ -1,10 +0,0 @@
-Strategic Focus: What are the primary areas of focus in the strategic development plan for the technology firm, and why were they chosen?
-Revenue Growth: How does the company plan to achieve a 20% increase in revenue through expansion into emerging markets?
-Innovation Investment: What specific types of AI algorithms and cloud solutions is the company planning to develop with the allocated $100 million R&D investment?
-Partnerships and Acquisitions Strategy: Can you explain the criteria used for selecting startups for partnerships and acquisitions in the AI and IoT sectors?
-Risk Management: What are the key risks identified for the company, and what strategies are in place to mitigate these risks?
-Financial Projections: Based on the strategic initiatives outlined, what are the projected financial outcomes for the company over the next three years?
-Market Competition: How does the company plan to continuously analyze and adapt to competitive strategies in the technology sector?
-Regulatory Compliance: What measures will the company take to ensure adherence to global regulations, especially when expanding into new markets?
-Product Portfolio: How will the strategic partnerships and acquisitions enhance the company's product portfolio?
-Sustainability: What steps is the company taking to ensure the sustainability of its growth in the face of technological changes and market competition?
cml_xgboost.model → models/cml_xgboost.model
RENAMED
File without changes

embedded_model.model → models/embedded_model.model
RENAMED
File without changes

embedded_model.model.wv.vectors_ngrams.npy → models/embedded_model.model.wv.vectors_ngrams.npy
RENAMED
File without changes
models/without_pronoun_cml_xgboost.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:933d1d5c5f83c30211dd9a497482c517a822df809c0498fed164de72bd7bf910
+size 1085795

models/without_pronoun_embedded_model.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:762240ca4040c68e44c403f16abce5683a0c4a005ec10f3dd0135a0e429a66c1
+size 1189196

models/without_pronoun_embedded_model.model.wv.vectors_ngrams.npy
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5cf06fe78185b373c97ee0616f599ce6b1aceb6445b8f666fac6cd4cd307fe46
+size 400000128
original_document_uuid_mapping.json
CHANGED
@@ -1,34 +1,35 @@
 {
-    "078-05-1126": "…",
-    "1234567A": "…",
-    "16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ": "…",
-    "18": "…",
-    "191280342": "…",
-    "192.168.0.1": "…",
-    "212": "…",
-    "4095-2609-9393-4932": "…",
-    "555-1234": "…",
-    "954567876544": "…",
-    "Account": "…",
-    "Bank": "…",
-    "David": "…",
-    "…": "…",
-    "…": "…",
-    "…": "…",
-    "…": "…",
-    "…": "…",
-    "…": "…",
-    "…": "…",
-    "…": "…",
-    "…": "…",
-    "…": "…",
-    "…": "…",
-    "…": "…",
-    "microsoft.com": "…",
-    "…": "…",
-    "…": "…",
-    "…": "…",
-    "…": "…",
-    "…": "…",
-    "…": "…",
+    "078-05-1126": "48f7c8a4",
+    "1234567A": "ab7ec0c3",
+    "16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ": "edf660df",
+    "18": "3f343449",
+    "191280342": "c263d176",
+    "192.168.0.1": "c83dd929",
+    "212": "a054c8c2",
+    "4095-2609-9393-4932": "bfd59a59",
+    "555-1234": "8fddc160",
+    "954567876544": "148fea84",
+    "Account": "3658044b",
+    "Bank": "49b9cffb",
+    "David": "97dc4202",
+    "IL150120690000003111111": "2f075e1d",
+    "IP": "c591bc5d",
+    "International": "c5462fed",
+    "Johnson": "7ce27ecb",
+    "Kate": "a18f3dda",
+    "Maine": "aaf4b006",
+    "September": "d615d819",
+    "account": "aa960526",
+    "bank": "4fd4e4c4",
+    "check": "e51c8e1c",
+    "crypto": "c7184a17",
+    "id": "f8380bf5",
+    "microsoft.com": "b6394fb9",
+    "phone": "c402f998",
+    "sent": "732237ac",
+    "social": "bcda6774",
+    "status": "f1b9c36f",
+    "test@presidio.site": "a295c5d0",
+    "valid": "08876c6f",
+    "wallet": "516361a1"
 }
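app.py de-anonymizes ChatGPT responses through an `inverse_uuid_map`, which is presumably just this file inverted. A minimal sketch (variable names assumed):

# Load the UUID mapping written by anonymize_file_clear.py and invert it
# so substituted UUIDs can be mapped back to the original entities.
import json

with open("original_document_uuid_mapping.json", encoding="utf-8") as f:
    uuid_map = json.load(f)                              # {"Maine": "aaf4b006", ...}

inverse_uuid_map = {v: k for k, v in uuid_map.items()}   # {"aaf4b006": "Maine", ...}
assert inverse_uuid_map["97dc4202"] == "David"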
utils_demo.py
ADDED
@@ -0,0 +1,25 @@
+import re  # needed for re.match below
+import uuid
+
+def process_tokens(tokens, inverse_uuid_map=None, uuid_map=None, embeddings_model=None, fhe_ner_detection=None, client=None):
+    """Processes tokens for deanonymization, anonymization, or default pass-through."""
+    processed_tokens = []
+    for token in tokens:
+        if not token.strip() or not re.match(r"\w+", token):  # Directly append non-word tokens or whitespace
+            processed_tokens.append(token)
+            continue
+        if inverse_uuid_map is not None:  # For deanonymizing the response
+            processed_tokens.append(inverse_uuid_map.get(token, token))
+        elif uuid_map is not None and embeddings_model is not None and fhe_ner_detection is not None and client is not None:  # For the FHEAnonymizer call
+            x = embeddings_model.wv[token][None]
+            prediction_proba = fhe_ner_detection.predict_proba(x)
+            probability = prediction_proba[0][1]
+            if probability >= 0.5:
+                tmp_uuid = uuid_map.get(token, str(uuid.uuid4())[:8])
+                processed_tokens.append(tmp_uuid)
+                uuid_map[token] = tmp_uuid
+            else:
+                processed_tokens.append(token)
+        else:
+            processed_tokens.append(token)
+    return ''.join(processed_tokens)
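A quick usage sketch for the new helper, de-anonymizing a response with the mapping above; the `token_pattern` regex is an assumption, the rest follows the code paths in `process_tokens`:

# Example: map UUID placeholders in a response back to the original entities.
import json
import re

from utils_demo import process_tokens

token_pattern = r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)"    # assumed tokenizer

with open("original_document_uuid_mapping.json", encoding="utf-8") as f:
    inverse_uuid_map = {v: k for k, v in json.load(f).items()}

response = "97dc4202 7ce27ecb lives in aaf4b006."
tokens = re.findall(token_pattern, response)
print(process_tokens(tokens, inverse_uuid_map=inverse_uuid_map))
# -> "David Johnson lives in Maine."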