encrypted-anonymization

Sleeping

App Files Files Community

kcelia committed on Mar 30, 2024

Commit

67fa189

unverified ·

1 Parent(s): cf6aebf

chore: update with marketing comments + add uvicorn server

Browse files

Files changed (4) hide show

app.py +300 -76
files/original_document_uuid_mapping.json +19 -1
server.py +105 -0
utils_demo.py +39 -14

app.py CHANGED Viewed

@@ -1,33 +1,48 @@
 """A Gradio app for anonymizing text data using FHE."""
 import os
 import re
 from typing import Dict, List
-import numpy
 import gradio as gr
 import pandas as pd
 from fhe_anonymizer import FHEAnonymizer
 from openai import OpenAI
 from utils_demo import *
 from concrete.ml.deployment import FHEModelClient
-ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n")
-ANONYMIZED_DOCUMENT = read_txt(ANONYMIZED_FILE_PATH)
-MAPPING_SENTENCES = read_pickle(MAPPING_SENTENCES_PATH)
 subprocess.Popen(["uvicorn", "server:app"], cwd=CURRENT_DIR)
 time.sleep(3)
-clean_directory()
-anonymizer = FHEAnonymizer()
-client = OpenAI(api_key=os.environ.get("openaikey"))
-# Generate a random user ID
-user_id = numpy.random.randint(0, 2**32)
-print(f"Your user ID is: {user_id}....")
 def select_static_sentences_fn(selected_sentences: List):
@@ -41,14 +56,14 @@ def select_static_sentences_fn(selected_sentences: List):
 def key_gen_fn() -> Dict:
-    """Generate keys for a given user.
-    Returns:
-        dict: A dictionary containing the generated keys and related information.
-    """
-    print("Step 1: Key Generation:")
-    client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{user_id}")
     client.load()
     # Creates the private and evaluation keys on the client side
@@ -59,10 +74,9 @@ def key_gen_fn() -> Dict:
     assert isinstance(serialized_evaluation_keys, bytes)
     # Save the evaluation key
-    evaluation_key_path = KEYS_DIR / f"{user_id}/evaluation_key"
-    with evaluation_key_path.open("wb") as f:
-        f.write(serialized_evaluation_keys)
     # anonymizer.generate_key()
@@ -73,39 +87,43 @@ def key_gen_fn() -> Dict:
         print(error_message)
         return {gen_key_btn: gr.update(value=error_message)}
     else:
         return {gen_key_btn: gr.update(value="Keys have been generated ✅")}
 def encrypt_query_fn(query):
-    print(f"Step 2 Query encryption: {query=}")
-    evaluation_key_path = KEYS_DIR / f"{user_id}/evaluation_key"
-    if not evaluation_key_path.is_file():
-        error_message = "Error ❌: Please generate the key first!"
-        return {output_encrypted_box: gr.update(value=error_message)}
     if is_user_query_valid(query):
-        error_msg = (
-            "Unable to process ❌: The request exceeds the length limit or falls "
-            "outside the scope of this document. Please refine your query."
-        )
-        print(error_msg)
-        return {query_box: gr.update(value=error_msg)}
     # Retrieve the client API
-    client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{user_id}")
     client.load()
     # Pattern to identify words and non-words (including punctuation, spaces, etc.)
     tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", query)
-    encrypted_tokens = []
     for token in tokens:
         if bool(re.match(r"^\s+$", token)):
             continue
-        # Directly append non-word tokens or whitespace to processed_tokens
         # Prediction for each word
         emb_x = get_batch_text_representation([token], EMBEDDINGS_MODEL, TOKENIZER)
@@ -114,36 +132,220 @@ def encrypt_query_fn(query):
         encrypted_tokens.append(encrypted_x)
-    write_pickle(KEYS_DIR / f"{user_id}/encrypted_input", encrypted_tokens)
-    #anonymizer.encrypt_query(query)
-    encrypted_quant_tokens_hex = [token.hex()[500:510] for token in encrypted_tokens]
-    return {output_encrypted_box: gr.update(value=" ".join(encrypted_quant_tokens_hex))}
-def run_fhe_fn(query_box):
-    evaluation_key_path = KEYS_DIR / "evaluation_key"
     if not evaluation_key_path.is_file():
-        error_message = "Error ❌: Please generate the key first!"
         return {anonymized_text_output: gr.update(value=error_message)}
-    encryted_query_path = KEYS_DIR / "encrypted_quantized_query"
-    if not encryted_query_path.is_file():
-        error_message = "Error ❌: Please encrypt your query first!"
         return {anonymized_text_output: gr.update(value=error_message)}
-    anonymizer.run_server_and_decrypt_output(query_box)
-    anonymized_text = read_pickle(KEYS_DIR / "reconstructed_sentence")
-    # Removing Spaces Before Punctuation:
-    anonymized_text = re.sub(r"\s([,.!?;:])", r"\1", anonymized_text)
-    identified_words_with_prob = read_pickle(KEYS_DIR / "identified_words_with_prob")
     # Convert the list of identified words and probabilities into a DataFrame
     if identified_words_with_prob:
@@ -152,9 +354,30 @@ def run_fhe_fn(query_box):
         )
     else:
         identified_df = pd.DataFrame(columns=["Identified Words", "Probability"])
     return anonymized_text, identified_df
 def query_chatgpt_fn(anonymized_query, anonymized_document):
     evaluation_key_path = KEYS_DIR / "evaluation_key"
@@ -250,7 +473,7 @@ with demo:
     with gr.Accordion("What is encrypted anonymization?", open=False):
         gr.Markdown(
-        """
         Anonymization is the process of removing personally identifiable information (PII)
         from data to protect individual privacy.
@@ -268,7 +491,6 @@ with demo:
     gr.Markdown(
         "## Step 1: Key generation\n\n"
         """In FHE schemes, two sets of keys are generated. First, the secret keys which are used for
         encrypting and decrypting data owned by the client. Second, the evaluation keys that allow
         a server to blindly process the encrypted data.
@@ -297,7 +519,7 @@ with demo:
                 """
             )
         with gr.Column():
-            gr.Markdown("**Anonymized document:**")
             gr.Markdown(
                 """You can see below the anonymized text, replaced with hexademical strings, that
                 will be sent to ChatGPT.
@@ -309,12 +531,14 @@ with demo:
     with gr.Row():
         with gr.Column():
             original_sentences_box = gr.CheckboxGroup(
-                 ORIGINAL_DOCUMENT, value=ORIGINAL_DOCUMENT, show_label=False,
             )
         with gr.Column():
-            anonymized_doc_box = gr.Textbox(show_label=False,
-                value=ANONYMIZED_DOCUMENT, interactive=False, lines=11
             )
     original_sentences_box.change(
@@ -357,27 +581,16 @@ with demo:
             )
         with gr.Column(scale=1, min_width=6):
-            gr.HTML("<div style='height: 25px;'></div>")
-            gr.Markdown(
-                """
-                <p align="center">
-                Encrypt the query locally with FHE
-                </p>
-                """
-            )
-            encrypt_btn = gr.Button("Encrypt query”")
-            gr.HTML("<div style='height: 25px;'></div>")
         with gr.Column(scale=5):
             output_encrypted_box = gr.Textbox(
-                label="Encrypted anonymized query that will be sent to the anonymization server:", lines=8
             )
-    encrypt_btn.click(
-        fn=encrypt_query_fn, inputs=[query_box], outputs=[query_box, output_encrypted_box]
-    )
     ########################## FHE processing Part ##########################
     gr.Markdown("<hr />")
@@ -395,12 +608,23 @@ with demo:
         label="Decrypted anonymized query that will be sent to ChatGPT:", lines=1, interactive=True
     )
-    identified_words_output = gr.Dataframe(label="Identified words:", visible=False)
     run_fhe_btn.click(
-        run_fhe_fn,
         inputs=[query_box],
-        outputs=[anonymized_text_output, identified_words_output],
     )
     ########################## ChatGpt Part ##########################

 """A Gradio app for anonymizing text data using FHE."""
+import base64
 import os
 import re
+import subprocess
+import time
+import uuid
 from typing import Dict, List
 import gradio as gr
+import numpy
 import pandas as pd
+import requests
 from fhe_anonymizer import FHEAnonymizer
 from openai import OpenAI
 from utils_demo import *
 from concrete.ml.deployment import FHEModelClient
+# Ensure the directory is clean before starting processes or reading files
+clean_directory()
+anonymizer = FHEAnonymizer()
+# The OpenAI API key is read from the `openaikey` environment variable; `os.environ.get`
+# yields None when the variable is not set.
+client = OpenAI(api_key=os.environ.get("openaikey"))
+# Start the Uvicorn server hosting the FastAPI app (the `app` object of `server.py`)
 subprocess.Popen(["uvicorn", "server:app"], cwd=CURRENT_DIR)
+# Fixed pause after spawning the server process - presumably to let it finish starting
+# before the first request is sent; TODO confirm 3 seconds is enough on slow hosts.
 time.sleep(3)
+# Load data from files required for the application
+UUID_MAP = read_json(MAPPING_UUID_PATH)
+ANONYMIZED_DOCUMENT = read_txt(ANONYMIZED_FILE_PATH)
+MAPPING_SENTENCES = read_pickle(MAPPING_SENTENCES_PATH)
+# The original document is split on blank lines, giving one entry per paragraph
+ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n")
+# The FHE client objects and the HTTP requests to the server are created inside the
+# step functions defined below, not at import time.
+# Generate a random user ID.
+# NOTE: this runs once at import time, so the same ID is used for every request handled
+# by this process, not one ID per browser session.
+USER_ID = numpy.random.randint(0, 2**32)
 def select_static_sentences_fn(selected_sentences: List):
 def key_gen_fn() -> Dict:
+    """Generate keys for a given user."""
+    print("------------ Step 1: Key Generation:")
+    print(f"Your user ID is: {USER_ID}....")
+    client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
     client.load()
     # Creates the private and evaluation keys on the client side
     assert isinstance(serialized_evaluation_keys, bytes)
     # Save the evaluation key
+    evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
+    write_bytes(evaluation_key_path, serialized_evaluation_keys)
     # anonymizer.generate_key()
         print(error_message)
         return {gen_key_btn: gr.update(value=error_message)}
     else:
+        print("Keys have been generated ✅")
         return {gen_key_btn: gr.update(value="Keys have been generated ✅")}
 def encrypt_query_fn(query):
+    print(f"\n------------ Step 2: Query encryption: {query=}")
+    if not (KEYS_DIR / f"{USER_ID}/evaluation_key").is_file():
+        return {output_encrypted_box: gr.update(value="Error ❌: Please generate the key first!")}
     if is_user_query_valid(query):
+        return {
+            query_box: gr.update(
+                value=(
+                    "Unable to process ❌: The request exceeds the length limit or falls "
+                    "outside the scope of this document. Please refine your query."
+                )
+            )
+        }
     # Retrieve the client API
+    client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
     client.load()
+    encrypted_tokens = []
     # Pattern to identify words and non-words (including punctuation, spaces, etc.)
     tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", query)
     for token in tokens:
+        # 1- Skip whitespace-only tokens (punctuation tokens are still processed)
         if bool(re.match(r"^\s+$", token)):
             continue
+        # 2- Directly append non-word tokens or whitespace to processed_tokens
         # Prediction for each word
         emb_x = get_batch_text_representation([token], EMBEDDINGS_MODEL, TOKENIZER)
         encrypted_tokens.append(encrypted_x)
+    print(f"Data encrypted ✅ on Client Side")
+    assert len({len(token) for token in encrypted_tokens}) == 1
+    write_bytes(KEYS_DIR / f"{USER_ID}/encrypted_input", b"".join(encrypted_tokens))
+    write_bytes(
+        KEYS_DIR / f"{USER_ID}/encrypted_input_len", len(encrypted_tokens[0]).to_bytes(10, "big")
+    )
+    encrypted_quant_tokens_hex = [token.hex()[500:675] for token in encrypted_tokens]
+    return {
+        output_encrypted_box: gr.update(value=" ".join(encrypted_quant_tokens_hex)),
+        anonymized_text_output: gr.update(visible=True, value=None),
+        identified_words_output_df: gr.update(visible=False, value=None),
+    }
+def send_input_fn(query) -> Dict:
+    """Send the evaluation key and the encrypted query to the server.
+
+    Args:
+        query: The clear-text user query. It is accepted so the callback signature stays
+            unchanged, but it is deliberately not transmitted: the `/send_input` endpoint
+            only reads `user_id` and the uploaded files.
+
+    Returns:
+        Dict: A Gradio update carrying an error message for `anonymized_text_output` when
+            the evaluation key or the encrypted input is missing; None otherwise.
+    """
+    print("------------ Step 3.1: Send encrypted_data to the Server")
+    evaluation_key_path = KEYS_DIR / f"{USER_ID}/evaluation_key"
+    encrypted_input_path = KEYS_DIR / f"{USER_ID}/encrypted_input"
+    encrypted_input_len_path = KEYS_DIR / f"{USER_ID}/encrypted_input_len"
     if not evaluation_key_path.is_file():
+        error_message = (
+            "Error Encountered While Sending Data to the Server: "
+            f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
+        )
         return {anonymized_text_output: gr.update(value=error_message)}
+    if not encrypted_input_path.is_file():
+        error_message = (
+            "Error Encountered While Sending Data to the Server: The data has not been encrypted "
+            f"correctly on the client side - {encrypted_input_path.is_file()=}"
+        )
         return {anonymized_text_output: gr.update(value=error_message)}
+    # Only the user ID accompanies the files: posting the clear query would leak the very
+    # text the FHE pipeline is meant to protect.
+    data = {"user_id": USER_ID}
+    url = SERVER_URL + "send_input"
+    # Open the payloads in a `with` block so every handle is closed once the request is
+    # done, even if it raises.
+    with open(evaluation_key_path, "rb") as key_file, open(
+        encrypted_input_path, "rb"
+    ) as input_file, open(encrypted_input_len_path, "rb") as input_len_file:
+        # Order matters: the server reads the uploads by position (key, input, length).
+        files = [
+            ("files", key_file),
+            ("files", input_file),
+            ("files", input_len_file),
+        ]
+        # Send the encrypted input and evaluation key to the server
+        with requests.post(
+            url=url,
+            data=data,
+            files=files,
+        ) as resp:
+            print(
+                "Data sent to the server ✅" if resp.ok else "Error ❌ in sending data to the server"
+            )
+def run_fhe_in_server_fn() -> Dict:
+    """Ask the server to anonymize, in FHE, the encrypted query it already received.
+
+    Returns:
+        Dict: A Gradio update carrying an error message for `anonymized_text_output` when a
+            client-side file is missing or the server answers with an error; None otherwise.
+    """
+    print("------------ Step 3.2: Run in FHE on the Server Side")
+    user_keys_dir = KEYS_DIR / f"{USER_ID}"
+    # These two names appear verbatim in the messages below through the `{expr=}` specifier
+    evaluation_key_path = user_keys_dir / "evaluation_key"
+    encrypted_input_path = user_keys_dir / "encrypted_input"
+
+    error_message = None
+    if not evaluation_key_path.is_file():
+        error_message = (
+            "Error Encountered While Sending Data to the Server: "
+            f"The key has been generated correctly - {evaluation_key_path.is_file()=}"
+        )
+    elif not encrypted_input_path.is_file():
+        error_message = (
+            "Error Encountered While Sending Data to the Server: The data has not been encrypted "
+            f"correctly on the client side - {encrypted_input_path.is_file()=}"
+        )
+    if error_message is not None:
+        return {anonymized_text_output: gr.update(value=error_message)}
+
+    with requests.post(url=SERVER_URL + "run_fhe", data={"user_id": USER_ID}) as response:
+        if response.ok:
+            time.sleep(1)
+            # The server answers with a single number: the mean FHE run time per token
+            print(f"The query anonymization was computed in {response.json():.2f} s per token.")
+            return None
+        server_error = (
+            "⚠️ An error occurred on the Server Side. "
+            "Please check connectivity and data transmission."
+        )
+        return {anonymized_text_output: gr.update(value=server_error)}
+def get_output_fn() -> Dict:
+    """Download the encrypted anonymization result from the server.
+
+    The base64 fields returned by `/get_output` are decoded and stored in `CLIENT_DIR`.
+
+    Returns:
+        Dict: A Gradio update carrying an error message for `anonymized_text_output` when the
+            evaluation key or the encrypted input is missing on the client side; None
+            otherwise.
+    """
+    print("------------ Step 3.3: Get the output from the Server Side")
+    user_keys_dir = KEYS_DIR / f"{USER_ID}"
+
+    if not (user_keys_dir / "evaluation_key").is_file():
+        missing_key_message = (
+            "Error Encountered While Sending Data to the Server: "
+            "The key has not been generated correctly"
+        )
+        return {anonymized_text_output: gr.update(value=missing_key_message)}
+
+    if not (user_keys_dir / "encrypted_input").is_file():
+        missing_input_message = (
+            "Error Encountered While Sending Data to the Server: "
+            "The data has not been encrypted correctly on the client side"
+        )
+        return {anonymized_text_output: gr.update(value=missing_input_message)}
+
+    # Retrieve the encrypted output
+    with requests.post(url=SERVER_URL + "get_output", data={"user_id": USER_ID}) as response:
+        if not response.ok:
+            print("Error ❌ in getting data to the server")
+            return None
+
+        print("Data received ✅ from the remote Server")
+        payload = response.json()
+        output_b64 = payload["encrypted_output"]
+        output_len_b64 = payload["length"]
+
+        # Both fields travel base64-encoded inside the JSON body
+        raw_output = base64.b64decode(output_b64)
+        raw_output_len = base64.b64decode(output_len_b64)
+
+        # The result is kept in files because it is too large to pass through regular
+        # Gradio buttons (see https://github.com/gradio-app/gradio/issues/1877)
+        write_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output", raw_output)
+        write_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output_len", raw_output_len)
+def decrypt_fn(text) -> Dict:
+    """Decrypt the data on the `Client Side`."""
+    print("------------ Step 4: Dencrypt the data on the `Client Side`")
+    # Get the encrypted output path
+    encrypted_output_path = CLIENT_DIR / f"{USER_ID}_encrypted_output"
+    if not encrypted_output_path.is_file():
+        error_message = """⚠️ Please ensure that: \n
+                - the connectivity \n
+                - the query has been submitted \n
+                - the evaluation key has been generated \n
+                - the server processed the encrypted data \n
+                - the Client received the data from the Server before decrypting the prediction
+                """
+        print(error_message)
+        return error_message, None
+    # Retrieve the client API
+    client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{USER_ID}")
+    client.load()
+    # Load the encrypted output as bytes
+    encrypted_output = read_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output")
+    length = int.from_bytes(read_bytes(CLIENT_DIR / f"{USER_ID}_encrypted_output_len"), "big")
+    tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", text)
+    decrypted_output, identified_words_with_prob = [], []
+    i = 0
+    for token in tokens:
+        # Skip whitespace-only tokens: they were not encrypted, so they have no ciphertext
+        if bool(re.match(r"^\s+$", token)):
+            continue
+        else:
+            encrypted_token = encrypted_output[i : i + length]
+            prediction_proba = client.deserialize_decrypt_dequantize(encrypted_token)
+            probability = prediction_proba[0][1]
+            i += length
+            if probability >= 0.77:
+                identified_words_with_prob.append((token, probability))
+                # Use the existing UUID if available, otherwise generate a new one
+                tmp_uuid = UUID_MAP.get(token, str(uuid.uuid4())[:8])
+                decrypted_output.append(tmp_uuid)
+                UUID_MAP[token] = tmp_uuid
+            else:
+                decrypted_output.append(token)
+        # Update the UUID map with query.
+        write_json(MAPPING_UUID_PATH, UUID_MAP)
+    # Removing Spaces Before Punctuation:
+    anonymized_text = re.sub(r"\s([,.!?;:])", r"\1", " ".join(decrypted_output))
     # Convert the list of identified words and probabilities into a DataFrame
     if identified_words_with_prob:
         )
     else:
         identified_df = pd.DataFrame(columns=["Identified Words", "Probability"])
+    print(f"Decryption done ✅ on Client Side")
     return anonymized_text, identified_df
+def anonymization_with_fn(query):
+    """Run the whole anonymization pipeline for a query and build the Gradio updates.
+
+    Args:
+        query (str): The clear-text user query.
+
+    Returns:
+        dict: Updates for the anonymized text box and the identified-words dataframe.
+    """
+    # Client-side encryption and upload; the values returned by these steps are not used
+    encrypt_query_fn(query)
+    send_input_fn(query)
+    # Server-side FHE execution, then download of the encrypted result
+    for server_step in (run_fhe_in_server_fn, get_output_fn):
+        server_step()
+    decrypted = decrypt_fn(query)
+    anonymized_text, identified_df = decrypted
+    updates = {}
+    updates[anonymized_text_output] = gr.update(value=anonymized_text)
+    updates[identified_words_output_df] = gr.update(value=identified_df, visible=True)
+    return updates
 def query_chatgpt_fn(anonymized_query, anonymized_document):
     evaluation_key_path = KEYS_DIR / "evaluation_key"
     with gr.Accordion("What is encrypted anonymization?", open=False):
         gr.Markdown(
+            """
         Anonymization is the process of removing personally identifiable information (PII)
         from data to protect individual privacy.
     gr.Markdown(
         "## Step 1: Key generation\n\n"
         """In FHE schemes, two sets of keys are generated. First, the secret keys which are used for
         encrypting and decrypting data owned by the client. Second, the evaluation keys that allow
         a server to blindly process the encrypted data.
                 """
             )
         with gr.Column():
+            gr.Markdown("**Anonymized document:**")
             gr.Markdown(
                 """You can see below the anonymized text, replaced with hexademical strings, that
                 will be sent to ChatGPT.
     with gr.Row():
         with gr.Column():
             original_sentences_box = gr.CheckboxGroup(
+                ORIGINAL_DOCUMENT,
+                value=ORIGINAL_DOCUMENT,
+                show_label=False,
             )
         with gr.Column():
+            anonymized_doc_box = gr.Textbox(
+                show_label=False, value=ANONYMIZED_DOCUMENT, interactive=False, lines=11
             )
     original_sentences_box.change(
             )
         with gr.Column(scale=1, min_width=6):
+            gr.HTML("<div style='height: 77px;'></div>")
+            encrypt_btn = gr.Button("Encrypt query")
+            # gr.HTML("<div style='height: 50px;'></div>")
         with gr.Column(scale=5):
             output_encrypted_box = gr.Textbox(
+                label="Encrypted anonymized query that will be sent to the anonymization server:",
+                lines=8,
             )
     ########################## FHE processing Part ##########################
     gr.Markdown("<hr />")
         label="Decrypted anonymized query that will be sent to ChatGPT:", lines=1, interactive=True
     )
+    identified_words_output_df = gr.Dataframe(label="Identified words:", visible=False)
+    encrypt_btn.click(
+        fn=encrypt_query_fn,
+        inputs=[query_box],
+        outputs=[
+            query_box,
+            output_encrypted_box,
+            anonymized_text_output,
+            identified_words_output_df,
+        ],
+    )
     run_fhe_btn.click(
+        anonymization_with_fn,
         inputs=[query_box],
+        outputs=[anonymized_text_output, identified_words_output_df],
     )
     ########################## ChatGpt Part ##########################

files/original_document_uuid_mapping.json CHANGED Viewed

	@@ -1 +1,19 @@
1	- {"078-05-1126": "d8da62f1", "1234567A": "5e63c327", "16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ": "ac41d58b", "191280342": "59a83e41", "192.168.0.1": "116fe81e", "212": "144a2acc", "4095-2609-9393-4932": "e5b499b0", "555-1234": "d9e5704e", "954567876544": "9eb07461", "David": "ebe99761", "IL150120690000003111111": "5ca977a4", "International": "71d0f51c", "Johnson": "53a9291d", "Kate": "b474d794", "Maine": "6337f12f", "microsoft.com": "0d574451", "test@presidio.site": "1f78e797"}

+{
+    "078-05-1126": "d8da62f1",
+    "1234567A": "5e63c327",
+    "16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ": "ac41d58b",
+    "191280342": "59a83e41",
+    "192.168.0.1": "116fe81e",
+    "212": "144a2acc",
+    "4095-2609-9393-4932": "e5b499b0",
+    "555-1234": "d9e5704e",
+    "954567876544": "9eb07461",
+    "David": "ebe99761",
+    "IL150120690000003111111": "5ca977a4",
+    "International": "71d0f51c",
+    "Johnson": "53a9291d",
+    "Kate": "b474d794",
+    "Maine": "6337f12f",
+    "microsoft.com": "0d574451",
+    "test@presidio.site": "1f78e797"
+}

server.py ADDED Viewed

	@@ -0,0 +1,105 @@

+"""Server that will listen for GET and POST requests from the client."""
+import base64
+import time
+from typing import List
+import numpy
+from fastapi import FastAPI, File, Form, UploadFile
+from fastapi.responses import JSONResponse
+from utils_demo import *
+from utils_demo import SERVER_DIR
+from concrete.ml.deployment import FHEModelServer
+# Load the FHE server from the deployment directory; it is shared by all requests
+FHE_SERVER = FHEModelServer(DEPLOYMENT_DIR)
+# Initialize an instance of FastAPI. This `app` object is the one `app.py` serves with
+# `uvicorn server:app`.
+app = FastAPI()
+# Define the default route
+@app.get("/")
+def root():
+    """Root endpoint of the encrypted anonymization API.
+
+    Returns:
+        dict: The welcome message, under the `message` key.
+    """
+    welcome = "Welcome to your encrypted anonymization use-case with FHE!"
+    return {"message": welcome}
+@app.post("/send_input")
+def send_input(
+    user_id: str = Form(),
+    files: List[UploadFile] = File(),
+):
+    """Store the files uploaded by a client in `SERVER_DIR`.
+
+    Args:
+        user_id (str): Identifier used as the prefix of the stored file names.
+        files (List[UploadFile]): The uploads, read by position: evaluation key, encrypted
+            input, then the serialized length of one encrypted token.
+    """
+    # NOTE(review): `user_id` comes straight from the form and is interpolated into the
+    # file names without validation.
+    evaluation_key = files[0].file.read()
+    write_bytes(SERVER_DIR / f"{user_id}_valuation_key", evaluation_key)
+
+    encrypted_input = files[1].file.read()
+    write_bytes(SERVER_DIR / f"{user_id}_encrypted_input", encrypted_input)
+
+    encrypted_input_len = files[2].file.read()
+    write_bytes(SERVER_DIR / f"{user_id}_encrypted_len_input", encrypted_input_len)
+@app.post("/run_fhe")
+def run_fhe(
+    user_id: str = Form(),
+):
+    """Run the FHE inference on every encrypted token previously sent by a user.
+
+    Args:
+        user_id (str): Identifier used to locate the user's files in `SERVER_DIR`.
+
+    Returns:
+        JSONResponse: The mean execution time of `FHE_SERVER.run` per token, in seconds.
+    """
+    # Read the files (encrypted tokens, evaluation key, byte length of one encrypted token)
+    encrypted_tokens = (SERVER_DIR / f"{user_id}_encrypted_input").read_bytes()
+    evaluation_key = (SERVER_DIR / f"{user_id}_valuation_key").read_bytes()
+    token_length_bytes = (SERVER_DIR / f"{user_id}_encrypted_len_input").read_bytes()
+    token_length = int.from_bytes(token_length_bytes, "big")
+
+    timing, encrypted_output = [], []
+    # The client concatenated the tokens, so walk the buffer one token at a time
+    for start in range(0, len(encrypted_tokens), token_length):
+        enc_x = encrypted_tokens[start : start + token_length]
+        # Only the FHE execution itself is timed
+        tic = time.time()
+        enc_y = FHE_SERVER.run(enc_x, evaluation_key)
+        timing.append(round(time.time() - tic, 2))
+        encrypted_output.append(enc_y)
+
+    # Write the concatenated outputs, then the byte length of one output so that the
+    # client can split the buffer again
+    write_bytes(SERVER_DIR / f"{user_id}_encrypted_output", b"".join(encrypted_output))
+    output_length_bytes = len(encrypted_output[0]).to_bytes(10, "big")
+    write_bytes(SERVER_DIR / f"{user_id}_encrypted_output_len", output_length_bytes)
+
+    return JSONResponse(content=numpy.mean(timing))
+@app.post("/get_output")
+def get_output(user_id: str = Form()):
+    """Return the encrypted output stored for a user.
+
+    Args:
+        user_id (str): Identifier used to locate the user's files in `SERVER_DIR`.
+
+    Returns:
+        JSONResponse: The `encrypted_output` and `length` fields, both base64-encoded.
+    """
+    # Load the encrypted output and the stored length of one output token
+    encrypted_output = (SERVER_DIR / f"{user_id}_encrypted_output").read_bytes()
+    length = (SERVER_DIR / f"{user_id}_encrypted_output_len").read_bytes()
+
+    time.sleep(1)
+
+    # Raw bytes are not JSON serializable, hence the base64 text encoding
+    raw_fields = (("encrypted_output", encrypted_output), ("length", length))
+    content = {key: base64.b64encode(raw).decode("utf-8") for key, raw in raw_fields}
+
+    # Send the encrypted output
+    return JSONResponse(content)

utils_demo.py CHANGED Viewed

@@ -6,38 +6,51 @@ import shutil
 import string
 from collections import Counter
 from pathlib import Path
-from transformers import AutoModel, AutoTokenizer
 import numpy as np
 import torch
-MAX_USER_QUERY_LEN = 80
-# List of example queries for easy access
-DEFAULT_QUERIES = {
-    "Example Query 1": "Who visited microsoft.com on September 18?",
-    "Example Query 2": "Does Kate have a driving licence?",
-    "Example Query 3": "What's David Johnson's phone number?",
-}
-CURRENT_DIR = Path(__file__).parent
-DATA_PATH = CURRENT_DIR / "files"
-LOGREG_MODEL_PATH = CURRENT_DIR / "models" / "cml_logreg.model"
 DEPLOYMENT_DIR = CURRENT_DIR / "deployment"
-KEYS_DIR = DEPLOYMENT_DIR / "fhe_keys"
 ORIGINAL_FILE_PATH = DATA_PATH / "original_document.txt"
 ANONYMIZED_FILE_PATH = DATA_PATH / "anonymized_document.txt"
 MAPPING_UUID_PATH = DATA_PATH / "original_document_uuid_mapping.json"
 MAPPING_SENTENCES_PATH = DATA_PATH / "mapping_clear_to_anonymized.pkl"
 PROMPT_PATH = DATA_PATH / "chatgpt_prompt.txt"
-ALL_DIRS = [KEYS_DIR]
 # Load tokenizer and model
-TOKENIZER =  AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
 EMBEDDINGS_MODEL = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
 PUNCTUATION_LIST = list(string.punctuation)
@@ -163,3 +176,15 @@ def write_json(file_name, data):
     """Save data to a json file."""
     with open(file_name, "w", encoding="utf-8") as file:
         json.dump(data, file, indent=4, sort_keys=True)

 import string
 from collections import Counter
 from pathlib import Path
 import numpy as np
 import torch
+from transformers import AutoModel, AutoTokenizer
+from pathlib import Path
+# Core Application URL. `app.py` starts Uvicorn without host/port options, so this must
+# match Uvicorn's default address.
+SERVER_URL = "http://localhost:8000/"
+# Maximum length for user queries
+MAX_USER_QUERY_LEN = 80
+# Base Directories
+CURRENT_DIR = Path(__file__).parent
 DEPLOYMENT_DIR = CURRENT_DIR / "deployment"
+DATA_PATH = CURRENT_DIR / "files"
+# Deployment Directories
+CLIENT_DIR = DEPLOYMENT_DIR / "client_dir"
+SERVER_DIR = DEPLOYMENT_DIR / "server_dir"
+KEYS_DIR = DEPLOYMENT_DIR / ".fhe_keys"
+# All Directories
+# NOTE(review): presumably the directories reset by `clean_directory()` - confirm
+ALL_DIRS = [KEYS_DIR, CLIENT_DIR, SERVER_DIR]
+# Model and Data Files
+LOGREG_MODEL_PATH = CURRENT_DIR / "models" / "cml_logreg.model"
 ORIGINAL_FILE_PATH = DATA_PATH / "original_document.txt"
 ANONYMIZED_FILE_PATH = DATA_PATH / "anonymized_document.txt"
 MAPPING_UUID_PATH = DATA_PATH / "original_document_uuid_mapping.json"
 MAPPING_SENTENCES_PATH = DATA_PATH / "mapping_clear_to_anonymized.pkl"
 PROMPT_PATH = DATA_PATH / "chatgpt_prompt.txt"
+# List of example queries for easy access
+DEFAULT_QUERIES = {
+    "Example Query 1": "Who visited microsoft.com on September 18?",
+    "Example Query 2": "Does Kate have a driving licence?",
+    "Example Query 3": "What's David Johnson's phone number?",
+}
 # Load tokenizer and model
+# Both are instantiated at import time, so importing this module loads the pretrained
+# "obi/deid_roberta_i2b2" weights.
+TOKENIZER = AutoTokenizer.from_pretrained("obi/deid_roberta_i2b2")
 EMBEDDINGS_MODEL = AutoModel.from_pretrained("obi/deid_roberta_i2b2")
 PUNCTUATION_LIST = list(string.punctuation)
     """Save data to a json file."""
     with open(file_name, "w", encoding="utf-8") as file:
         json.dump(data, file, indent=4, sort_keys=True)
+def write_bytes(path, data):
+    """Write binary data to a file, replacing any existing content.
+
+    Args:
+        path (Path): Destination file; it must expose `open`, as `pathlib.Path` does.
+        data (bytes): The content to write.
+    """
+    with path.open(mode="wb") as stream:
+        stream.write(data)
+def read_bytes(path):
+    """Return the whole content of a binary file.
+
+    Args:
+        path (Path): File to read; it must expose `open`, as `pathlib.Path` does.
+
+    Returns:
+        bytes: The file content.
+    """
+    with path.open(mode="rb") as stream:
+        content = stream.read()
+    return content