Spaces:
Sleeping
Sleeping
chore: update Marketing v2
Browse files- app.py +90 -59
- files/anonymized_document.txt +5 -5
- files/mapping_clear_to_anonymized.pkl +2 -2
- files/mapping_clear_to_encrypted.pkl +2 -2
- files/mapping_doc_embedding_path.pkl +3 -0
- files/original_document.txt +2 -2
- files/original_document_uuid_mapping.json +6 -8
- utils_demo.py +4 -1
app.py
CHANGED
@@ -35,6 +35,7 @@ ANONYMIZED_DOCUMENT = read_txt(ANONYMIZED_FILE_PATH)
|
|
35 |
MAPPING_ANONYMIZED_SENTENCES = read_pickle(MAPPING_ANONYMIZED_SENTENCES_PATH)
|
36 |
MAPPING_ENCRYPTED_SENTENCES = read_pickle(MAPPING_ENCRYPTED_SENTENCES_PATH)
|
37 |
ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n")
|
|
|
38 |
print(ORIGINAL_DOCUMENT)
|
39 |
|
40 |
# 4. Data Processing and Operations (No specific operations shown here, assuming it's part of anonymizer or client usage)
|
@@ -54,7 +55,7 @@ def select_static_anonymized_sentences_fn(selected_sentences: List):
|
|
54 |
|
55 |
anonymized_selected_sentence = [sentence for _, sentence in anonymized_selected_sentence]
|
56 |
|
57 |
-
return
|
58 |
|
59 |
|
60 |
def key_gen_fn() -> Dict:
|
@@ -92,23 +93,48 @@ def key_gen_fn() -> Dict:
|
|
92 |
print("Keys have been generated ✅")
|
93 |
return {gen_key_btn: gr.update(value="Keys have been generated ✅")}
|
94 |
|
95 |
-
def select_static_encrypted_sentences_fn(selected_sentences: List):
|
96 |
|
97 |
-
|
98 |
|
99 |
-
|
100 |
|
101 |
-
|
|
|
|
|
|
|
|
|
|
|
102 |
|
103 |
-
|
|
|
104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
def encrypt_query_fn(query):
|
107 |
|
108 |
print(f"\n------------ Step 2: Query encryption: {query=}")
|
109 |
|
110 |
if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
|
111 |
-
return {output_encrypted_box: gr.update(value="Error ❌: Please generate the key first!")}
|
112 |
|
113 |
if is_user_query_valid(query):
|
114 |
return {
|
@@ -156,8 +182,8 @@ def encrypt_query_fn(query):
|
|
156 |
encrypted_quant_tokens_hex = [token.hex()[500:580] for token in encrypted_tokens]
|
157 |
|
158 |
return {
|
159 |
-
output_encrypted_box: gr.update(value=" ".join(encrypted_quant_tokens_hex), lines=
|
160 |
-
|
161 |
identified_words_output_df: gr.update(visible=False, value=None),
|
162 |
}
|
163 |
|
@@ -176,14 +202,14 @@ def send_input_fn(query) -> Dict:
|
|
176 |
"Error Encountered While Sending Data to the Server: "
|
177 |
f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
|
178 |
)
|
179 |
-
return {
|
180 |
|
181 |
if not encrypted_input_path.is_file():
|
182 |
error_message = (
|
183 |
"Error Encountered While Sending Data to the Server: The data has not been encrypted "
|
184 |
f"correctly on the client side - {encrypted_input_path.is_file()=}"
|
185 |
)
|
186 |
-
return {
|
187 |
|
188 |
# Define the data and files to post
|
189 |
data = {"user_id": USER_ID, "input": query}
|
@@ -218,14 +244,14 @@ def run_fhe_in_server_fn() -> Dict:
|
|
218 |
"Error Encountered While Sending Data to the Server: "
|
219 |
f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
|
220 |
)
|
221 |
-
return {
|
222 |
|
223 |
if not encrypted_input_path.is_file():
|
224 |
error_message = (
|
225 |
"Error Encountered While Sending Data to the Server: The data has not been encrypted "
|
226 |
f"correctly on the client side - {encrypted_input_path.is_file()=}"
|
227 |
)
|
228 |
-
return {
|
229 |
|
230 |
data = {
|
231 |
"user_id": USER_ID,
|
@@ -239,7 +265,7 @@ def run_fhe_in_server_fn() -> Dict:
|
|
239 |
) as response:
|
240 |
if not response.ok:
|
241 |
return {
|
242 |
-
|
243 |
value=(
|
244 |
"⚠️ An error occurred on the Server Side. "
|
245 |
"Please check connectivity and data transmission."
|
@@ -260,14 +286,14 @@ def get_output_fn() -> Dict:
|
|
260 |
"Error Encountered While Sending Data to the Server: "
|
261 |
"The key has not been generated correctly"
|
262 |
)
|
263 |
-
return {
|
264 |
|
265 |
if not (KEYS_DIR / f"{USER_ID}/encrypted_input").is_file():
|
266 |
error_message = (
|
267 |
"Error Encountered While Sending Data to the Server: "
|
268 |
"The data has not been encrypted correctly on the client side"
|
269 |
)
|
270 |
-
return {
|
271 |
|
272 |
data = {
|
273 |
"user_id": USER_ID,
|
@@ -372,7 +398,7 @@ def decrypt_fn(text) -> Dict:
|
|
372 |
return anonymized_text, identified_df
|
373 |
|
374 |
|
375 |
-
def anonymization_with_fn(query):
|
376 |
|
377 |
encrypt_query_fn(query)
|
378 |
|
@@ -385,8 +411,9 @@ def anonymization_with_fn(query):
|
|
385 |
anonymized_text, identified_df = decrypt_fn(query)
|
386 |
|
387 |
return {
|
388 |
-
|
389 |
-
|
|
|
390 |
}
|
391 |
|
392 |
|
@@ -402,10 +429,9 @@ def query_chatgpt_fn(anonymized_query, anonymized_document):
|
|
402 |
error_message = "Error ❌: Please encrypt your query first!"
|
403 |
return {chatgpt_response_anonymized: gr.update(value=error_message)}
|
404 |
|
405 |
-
|
406 |
|
407 |
# Prepare prompt
|
408 |
-
initial_prompt = prompt + "\n"
|
409 |
query = (
|
410 |
"Document content:\n```\n"
|
411 |
+ anonymized_document
|
@@ -414,12 +440,12 @@ def query_chatgpt_fn(anonymized_query, anonymized_document):
|
|
414 |
+ anonymized_query
|
415 |
+ "\n```"
|
416 |
)
|
417 |
-
print(f'
|
418 |
|
419 |
completion = client.chat.completions.create(
|
420 |
model="gpt-4-1106-preview", # Replace with "gpt-4" if available
|
421 |
messages=[
|
422 |
-
{"role": "system", "content":
|
423 |
{"role": "user", "content": query},
|
424 |
],
|
425 |
)
|
@@ -472,26 +498,31 @@ with demo:
|
|
472 |
"""
|
473 |
)
|
474 |
|
475 |
-
|
476 |
-
|
477 |
-
|
478 |
-
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
-
|
484 |
-
|
485 |
-
|
486 |
-
|
487 |
-
|
488 |
-
|
489 |
-
|
490 |
-
|
491 |
-
|
492 |
-
|
493 |
"""
|
494 |
-
|
|
|
|
|
|
|
|
|
|
|
495 |
|
496 |
########################## Key Gen Part ##########################
|
497 |
|
@@ -535,16 +566,10 @@ with demo:
|
|
535 |
encrypt_doc_btn = gr.Button("Encrypt the document")
|
536 |
|
537 |
with gr.Column(scale=5):
|
538 |
-
|
539 |
-
label="Encrypted document:",
|
540 |
-
show_label=True, value=ANONYMIZED_DOCUMENT, interactive=False, lines=11
|
541 |
)
|
542 |
|
543 |
-
original_sentences_box.change(
|
544 |
-
fn=select_static_anonymized_sentences_fn,
|
545 |
-
inputs=[original_sentences_box],
|
546 |
-
outputs=[anonymized_doc_box],
|
547 |
-
)
|
548 |
|
549 |
########################## User Query Part ##########################
|
550 |
|
@@ -577,7 +602,7 @@ with demo:
|
|
577 |
|
578 |
with gr.Column(scale=1, min_width=6):
|
579 |
gr.HTML("<div style='height: 77px;'></div>")
|
580 |
-
|
581 |
# gr.HTML("<div style='height: 50px;'></div>")
|
582 |
|
583 |
with gr.Column(scale=5):
|
@@ -602,34 +627,40 @@ with demo:
|
|
602 |
with gr.Row():
|
603 |
with gr.Column(scale=5):
|
604 |
|
605 |
-
|
606 |
-
label="Decrypted and anonymized document", lines=
|
607 |
)
|
608 |
|
609 |
with gr.Column(scale=5):
|
610 |
|
611 |
anonymized_query_output = gr.Textbox(
|
612 |
-
label="Decrypted and anonymized prompt", lines=
|
613 |
)
|
614 |
|
615 |
|
616 |
identified_words_output_df = gr.Dataframe(label="Identified words:", visible=False)
|
617 |
|
618 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
619 |
fn=encrypt_query_fn,
|
620 |
inputs=[query_box],
|
621 |
outputs=[
|
622 |
query_box,
|
623 |
output_encrypted_box,
|
624 |
-
|
625 |
identified_words_output_df,
|
626 |
],
|
627 |
)
|
628 |
|
629 |
run_fhe_btn.click(
|
630 |
anonymization_with_fn,
|
631 |
-
inputs=[query_box],
|
632 |
-
outputs=[
|
633 |
)
|
634 |
|
635 |
########################## ChatGpt Part ##########################
|
@@ -651,7 +682,7 @@ with demo:
|
|
651 |
|
652 |
chatgpt_button.click(
|
653 |
query_chatgpt_fn,
|
654 |
-
inputs=[
|
655 |
outputs=[chatgpt_response_anonymized, chatgpt_response_deanonymized],
|
656 |
)
|
657 |
|
|
|
35 |
MAPPING_ANONYMIZED_SENTENCES = read_pickle(MAPPING_ANONYMIZED_SENTENCES_PATH)
|
36 |
MAPPING_ENCRYPTED_SENTENCES = read_pickle(MAPPING_ENCRYPTED_SENTENCES_PATH)
|
37 |
ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n")
|
38 |
+
MAPPING_DOC_EMBEDDING = read_pickle(MAPPING_DOC_EMBEDDING_PATH)
|
39 |
print(ORIGINAL_DOCUMENT)
|
40 |
|
41 |
# 4. Data Processing and Operations (No specific operations shown here, assuming it's part of anonymizer or client usage)
|
|
|
55 |
|
56 |
anonymized_selected_sentence = [sentence for _, sentence in anonymized_selected_sentence]
|
57 |
|
58 |
+
return "\n\n".join(anonymized_selected_sentence)
|
59 |
|
60 |
|
61 |
def key_gen_fn() -> Dict:
|
|
|
93 |
print("Keys have been generated ✅")
|
94 |
return {gen_key_btn: gr.update(value="Keys have been generated ✅")}
|
95 |
|
|
|
96 |
|
97 |
+
def encrypt_doc_fn(doc):
|
98 |
|
99 |
+
print(f"\n------------ Step 2.1: Doc encryption: {doc=}")
|
100 |
|
101 |
+
if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
|
102 |
+
return {encrypted_doc_box: gr.update(value="Error ❌: Please generate the key first!", lines=10)}
|
103 |
+
|
104 |
+
# Retrieve the client API
|
105 |
+
client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
|
106 |
+
client.load()
|
107 |
|
108 |
+
encrypted_tokens = []
|
109 |
+
tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", ' '.join(doc))
|
110 |
|
111 |
+
for token in tokens:
|
112 |
+
if token.strip() and re.match(r"\w+", token):
|
113 |
+
emb_x = MAPPING_DOC_EMBEDDING[token]
|
114 |
+
assert emb_x.shape == (1, 1024)
|
115 |
+
encrypted_x = client.quantize_encrypt_serialize(emb_x)
|
116 |
+
assert isinstance(encrypted_x, bytes)
|
117 |
+
encrypted_tokens.append(encrypted_x)
|
118 |
+
|
119 |
+
print("Doc encrypted ✅ on Client Side")
|
120 |
+
|
121 |
+
# No need to save it
|
122 |
+
# write_bytes(KEYS_DIR / f"{USER_ID}/encrypted_doc", b"".join(encrypted_tokens))
|
123 |
+
|
124 |
+
encrypted_quant_tokens_hex = [token.hex()[500:510] for token in encrypted_tokens]
|
125 |
+
|
126 |
+
return {
|
127 |
+
encrypted_doc_box: gr.update(value=" ".join(encrypted_quant_tokens_hex), lines=10),
|
128 |
+
anonymized_doc_output: gr.update(visible=True, value=None),
|
129 |
+
}
|
130 |
+
|
131 |
|
132 |
def encrypt_query_fn(query):
|
133 |
|
134 |
print(f"\n------------ Step 2: Query encryption: {query=}")
|
135 |
|
136 |
if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
|
137 |
+
return {output_encrypted_box: gr.update(value="Error ❌: Please generate the key first!", lines=8)}
|
138 |
|
139 |
if is_user_query_valid(query):
|
140 |
return {
|
|
|
182 |
encrypted_quant_tokens_hex = [token.hex()[500:580] for token in encrypted_tokens]
|
183 |
|
184 |
return {
|
185 |
+
output_encrypted_box: gr.update(value=" ".join(encrypted_quant_tokens_hex), lines=8),
|
186 |
+
anonymized_query_output: gr.update(visible=True, value=None),
|
187 |
identified_words_output_df: gr.update(visible=False, value=None),
|
188 |
}
|
189 |
|
|
|
202 |
"Error Encountered While Sending Data to the Server: "
|
203 |
f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
|
204 |
)
|
205 |
+
return {anonymized_query_output: gr.update(value=error_message)}
|
206 |
|
207 |
if not encrypted_input_path.is_file():
|
208 |
error_message = (
|
209 |
"Error Encountered While Sending Data to the Server: The data has not been encrypted "
|
210 |
f"correctly on the client side - {encrypted_input_path.is_file()=}"
|
211 |
)
|
212 |
+
return {anonymized_query_output: gr.update(value=error_message)}
|
213 |
|
214 |
# Define the data and files to post
|
215 |
data = {"user_id": USER_ID, "input": query}
|
|
|
244 |
"Error Encountered While Sending Data to the Server: "
|
245 |
f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
|
246 |
)
|
247 |
+
return {anonymized_query_output: gr.update(value=error_message)}
|
248 |
|
249 |
if not encrypted_input_path.is_file():
|
250 |
error_message = (
|
251 |
"Error Encountered While Sending Data to the Server: The data has not been encrypted "
|
252 |
f"correctly on the client side - {encrypted_input_path.is_file()=}"
|
253 |
)
|
254 |
+
return {anonymized_query_output: gr.update(value=error_message)}
|
255 |
|
256 |
data = {
|
257 |
"user_id": USER_ID,
|
|
|
265 |
) as response:
|
266 |
if not response.ok:
|
267 |
return {
|
268 |
+
anonymized_query_output: gr.update(
|
269 |
value=(
|
270 |
"⚠️ An error occurred on the Server Side. "
|
271 |
"Please check connectivity and data transmission."
|
|
|
286 |
"Error Encountered While Sending Data to the Server: "
|
287 |
"The key has not been generated correctly"
|
288 |
)
|
289 |
+
return {anonymized_query_output: gr.update(value=error_message)}
|
290 |
|
291 |
if not (KEYS_DIR / f"{USER_ID}/encrypted_input").is_file():
|
292 |
error_message = (
|
293 |
"Error Encountered While Sending Data to the Server: "
|
294 |
"The data has not been encrypted correctly on the client side"
|
295 |
)
|
296 |
+
return {anonymized_query_output: gr.update(value=error_message)}
|
297 |
|
298 |
data = {
|
299 |
"user_id": USER_ID,
|
|
|
398 |
return anonymized_text, identified_df
|
399 |
|
400 |
|
401 |
+
def anonymization_with_fn(selected_sentences, query):
|
402 |
|
403 |
encrypt_query_fn(query)
|
404 |
|
|
|
411 |
anonymized_text, identified_df = decrypt_fn(query)
|
412 |
|
413 |
return {
|
414 |
+
anonymized_doc_output: gr.update(value=select_static_anonymized_sentences_fn(selected_sentences)),
|
415 |
+
anonymized_query_output: gr.update(value=anonymized_text),
|
416 |
+
identified_words_output_df: gr.update(value=identified_df, visible=False),
|
417 |
}
|
418 |
|
419 |
|
|
|
429 |
error_message = "Error ❌: Please encrypt your query first!"
|
430 |
return {chatgpt_response_anonymized: gr.update(value=error_message)}
|
431 |
|
432 |
+
context_prompt = read_txt(PROMPT_PATH)
|
433 |
|
434 |
# Prepare prompt
|
|
|
435 |
query = (
|
436 |
"Document content:\n```\n"
|
437 |
+ anonymized_document
|
|
|
440 |
+ anonymized_query
|
441 |
+ "\n```"
|
442 |
)
|
443 |
+
print(f'Prompt of CHATGPT:\n{query}')
|
444 |
|
445 |
completion = client.chat.completions.create(
|
446 |
model="gpt-4-1106-preview", # Replace with "gpt-4" if available
|
447 |
messages=[
|
448 |
+
{"role": "system", "content": context_prompt},
|
449 |
{"role": "user", "content": query},
|
450 |
],
|
451 |
)
|
|
|
498 |
"""
|
499 |
)
|
500 |
|
501 |
+
gr.Markdown(
|
502 |
+
"""
|
503 |
+
<p align="center" style="font-size: 16px;">
|
504 |
+
Anonymization is the process of removing personally identifiable information (PII) data from
|
505 |
+
a document in order to protect individual privacy.</p>
|
506 |
+
|
507 |
+
<p align="center" style="font-size: 16px;">
|
508 |
+
Encrypted anonymization uses Fully Homomorphic Encryption (FHE) to anonymize personally
|
509 |
+
identifiable information (PII) within encrypted documents, enabling computations to be
|
510 |
+
performed on the encrypted data.</p>
|
511 |
+
|
512 |
+
<p align="center" style="font-size: 16px;">
|
513 |
+
In the example above, we're showing how encrypted anonymization can be leveraged to use LLM
|
514 |
+
services such as ChatGPT in a privacy-preserving manner.</p>
|
515 |
+
"""
|
516 |
+
)
|
517 |
+
|
518 |
+
gr.Markdown(
|
519 |
"""
|
520 |
+
<p align="center">
|
521 |
+
<img width="75%" height="30%" src="https://raw.githubusercontent.com/kcelia/Img/main/fhe_anonymization_banner.png">
|
522 |
+
</p>
|
523 |
+
"""
|
524 |
+
)
|
525 |
+
|
526 |
|
527 |
########################## Key Gen Part ##########################
|
528 |
|
|
|
566 |
encrypt_doc_btn = gr.Button("Encrypt the document")
|
567 |
|
568 |
with gr.Column(scale=5):
|
569 |
+
encrypted_doc_box = gr.Textbox(
|
570 |
+
label="Encrypted document:", show_label=True, interactive=False, lines=10
|
|
|
571 |
)
|
572 |
|
|
|
|
|
|
|
|
|
|
|
573 |
|
574 |
########################## User Query Part ##########################
|
575 |
|
|
|
602 |
|
603 |
with gr.Column(scale=1, min_width=6):
|
604 |
gr.HTML("<div style='height: 77px;'></div>")
|
605 |
+
encrypt_query_btn = gr.Button("Encrypt the prompt")
|
606 |
# gr.HTML("<div style='height: 50px;'></div>")
|
607 |
|
608 |
with gr.Column(scale=5):
|
|
|
627 |
with gr.Row():
|
628 |
with gr.Column(scale=5):
|
629 |
|
630 |
+
anonymized_doc_output = gr.Textbox(
|
631 |
+
label="Decrypted and anonymized document", lines=10, interactive=True
|
632 |
)
|
633 |
|
634 |
with gr.Column(scale=5):
|
635 |
|
636 |
anonymized_query_output = gr.Textbox(
|
637 |
+
label="Decrypted and anonymized prompt", lines=10, interactive=True
|
638 |
)
|
639 |
|
640 |
|
641 |
identified_words_output_df = gr.Dataframe(label="Identified words:", visible=False)
|
642 |
|
643 |
+
encrypt_doc_btn.click(
|
644 |
+
fn=encrypt_doc_fn,
|
645 |
+
inputs=[original_sentences_box],
|
646 |
+
outputs=[encrypted_doc_box, anonymized_doc_output],
|
647 |
+
)
|
648 |
+
|
649 |
+
encrypt_query_btn.click(
|
650 |
fn=encrypt_query_fn,
|
651 |
inputs=[query_box],
|
652 |
outputs=[
|
653 |
query_box,
|
654 |
output_encrypted_box,
|
655 |
+
anonymized_query_output,
|
656 |
identified_words_output_df,
|
657 |
],
|
658 |
)
|
659 |
|
660 |
run_fhe_btn.click(
|
661 |
anonymization_with_fn,
|
662 |
+
inputs=[original_sentences_box, query_box],
|
663 |
+
outputs=[anonymized_doc_output, anonymized_query_output, identified_words_output_df],
|
664 |
)
|
665 |
|
666 |
########################## ChatGpt Part ##########################
|
|
|
682 |
|
683 |
chatgpt_button.click(
|
684 |
query_chatgpt_fn,
|
685 |
+
inputs=[anonymized_query_output, anonymized_doc_output],
|
686 |
outputs=[chatgpt_response_anonymized, chatgpt_response_deanonymized],
|
687 |
)
|
688 |
|
files/anonymized_document.txt
CHANGED
@@ -1,11 +1,11 @@
|
|
1 |
-
Members:
|
2 |
|
3 |
-
Date:
|
4 |
|
5 |
-
Scope:
|
6 |
|
7 |
-
Amount: Bob agrees to pay
|
8 |
|
9 |
Deadline: The logo design must be completed and delivered to Bob within 14 days of the contract signing date.
|
10 |
|
11 |
-
Payment terms:
|
|
|
1 |
+
Members: a5989a5c and 20f545cf
|
2 |
|
3 |
+
Date: 7bbd0258 28ebebcd, 87a7f982
|
4 |
|
5 |
+
Scope: 20f545cf agrees to provide graphic design services to a5989a5c for the creation of a company logo.
|
6 |
|
7 |
+
Amount: Bob agrees to pay 20f545cf 500 upon completion and delivery of the logo.
|
8 |
|
9 |
Deadline: The logo design must be completed and delivered to Bob within 14 days of the contract signing date.
|
10 |
|
11 |
+
Payment terms: 20f545cf's international bank account N: 43a4c5f3
|
files/mapping_clear_to_anonymized.pkl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:aed1a1360ae82291357e5de8369d63d5514d90114743d1845b32642df9086902
|
3 |
+
size 906
|
files/mapping_clear_to_encrypted.pkl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:45e4ba890f0b8c8d239534f9c6c1d0878f5419b62af6b32d9d7e758a0490ea8a
|
3 |
+
size 916
|
files/mapping_doc_embedding_path.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:faa0f74bc4358424e29118dc9714512f092d83756a77d596dd9ce56c9555b444
|
3 |
+
size 211319
|
files/original_document.txt
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
Members: David
|
2 |
|
3 |
Date: February 06, 2000
|
4 |
|
@@ -8,4 +8,4 @@ Amount: Bob agrees to pay Kate $500 upon completion and delivery of the logo.
|
|
8 |
|
9 |
Deadline: The logo design must be completed and delivered to Bob within 14 days of the contract signing date.
|
10 |
|
11 |
-
Payment terms: Kate
|
|
|
1 |
+
Members: David and Kate
|
2 |
|
3 |
Date: February 06, 2000
|
4 |
|
|
|
8 |
|
9 |
Deadline: The logo design must be completed and delivered to Bob within 14 days of the contract signing date.
|
10 |
|
11 |
+
Payment terms: Kate's international bank account N°: IL150120690000003111111
|
files/original_document_uuid_mapping.json
CHANGED
@@ -1,10 +1,8 @@
|
|
1 |
{
|
2 |
-
"06": "
|
3 |
-
"2000": "
|
4 |
-
"David": "
|
5 |
-
"February": "
|
6 |
-
"
|
7 |
-
"
|
8 |
-
"Johnson": "70fc6ec5",
|
9 |
-
"Kate": "2708cb61"
|
10 |
}
|
|
|
1 |
{
|
2 |
+
"06": "28ebebcd",
|
3 |
+
"2000": "87a7f982",
|
4 |
+
"David": "a5989a5c",
|
5 |
+
"February": "7bbd0258",
|
6 |
+
"IL150120690000003111111": "43a4c5f3",
|
7 |
+
"Kate": "20f545cf"
|
|
|
|
|
8 |
}
|
utils_demo.py
CHANGED
@@ -40,6 +40,8 @@ ANONYMIZED_FILE_PATH = DATA_PATH / "anonymized_document.txt"
|
|
40 |
MAPPING_UUID_PATH = DATA_PATH / "original_document_uuid_mapping.json"
|
41 |
MAPPING_ANONYMIZED_SENTENCES_PATH = DATA_PATH / "mapping_clear_to_anonymized.pkl"
|
42 |
MAPPING_ENCRYPTED_SENTENCES_PATH = DATA_PATH / "mapping_clear_to_encrypted.pkl"
|
|
|
|
|
43 |
PROMPT_PATH = DATA_PATH / "chatgpt_prompt.txt"
|
44 |
|
45 |
|
@@ -57,7 +59,8 @@ EMBEDDINGS_MODEL = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
|
|
57 |
PUNCTUATION_LIST = list(string.punctuation)
|
58 |
PUNCTUATION_LIST.remove("%")
|
59 |
PUNCTUATION_LIST.remove("$")
|
60 |
-
PUNCTUATION_LIST = "".join(PUNCTUATION_LIST)
|
|
|
61 |
|
62 |
|
63 |
def clean_directory() -> None:
|
|
|
40 |
MAPPING_UUID_PATH = DATA_PATH / "original_document_uuid_mapping.json"
|
41 |
MAPPING_ANONYMIZED_SENTENCES_PATH = DATA_PATH / "mapping_clear_to_anonymized.pkl"
|
42 |
MAPPING_ENCRYPTED_SENTENCES_PATH = DATA_PATH / "mapping_clear_to_encrypted.pkl"
|
43 |
+
MAPPING_DOC_EMBEDDING_PATH = DATA_PATH / "mapping_doc_embedding_path.pkl"
|
44 |
+
|
45 |
PROMPT_PATH = DATA_PATH / "chatgpt_prompt.txt"
|
46 |
|
47 |
|
|
|
59 |
PUNCTUATION_LIST = list(string.punctuation)
|
60 |
PUNCTUATION_LIST.remove("%")
|
61 |
PUNCTUATION_LIST.remove("$")
|
62 |
+
PUNCTUATION_LIST = "".join(PUNCTUATION_LIST) + '°'
|
63 |
+
print(f'{PUNCTUATION_LIST=}')
|
64 |
|
65 |
|
66 |
def clean_directory() -> None:
|