"""A Gradio app for anonymizing text data using FHE.""" import os import re from typing import Dict, List import numpy import gradio as gr import pandas as pd from fhe_anonymizer import FHEAnonymizer from openai import OpenAI from utils_demo import * from concrete.ml.deployment import FHEModelClient ORIGINAL_DOCUMENT = read_txt(ORIGINAL_FILE_PATH).split("\n\n") ANONYMIZED_DOCUMENT = read_txt(ANONYMIZED_FILE_PATH) MAPPING_SENTENCES = read_pickle(MAPPING_SENTENCES_PATH) subprocess.Popen(["uvicorn", "server:app"], cwd=CURRENT_DIR) time.sleep(3) clean_directory() anonymizer = FHEAnonymizer() client = OpenAI(api_key=os.environ.get("openaikey")) # Generate a random user ID user_id = numpy.random.randint(0, 2**32) print(f"Your user ID is: {user_id}....") def select_static_sentences_fn(selected_sentences: List): selected_sentences = [MAPPING_SENTENCES[sentence] for sentence in selected_sentences] anonymized_selected_sentence = sorted(selected_sentences, key=lambda x: x[0]) anonymized_selected_sentence = [sentence for _, sentence in anonymized_selected_sentence] return {anonymized_doc_box: gr.update(value="\n\n".join(anonymized_selected_sentence))} def key_gen_fn() -> Dict: """Generate keys for a given user. Returns: dict: A dictionary containing the generated keys and related information. """ print("Step 1: Key Generation:") client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{user_id}") client.load() # Creates the private and evaluation keys on the client side client.generate_private_and_evaluation_keys() # Get the serialized evaluation keys serialized_evaluation_keys = client.get_serialized_evaluation_keys() assert isinstance(serialized_evaluation_keys, bytes) # Save the evaluation key evaluation_key_path = KEYS_DIR / f"{user_id}/evaluation_key" with evaluation_key_path.open("wb") as f: f.write(serialized_evaluation_keys) # anonymizer.generate_key() if not evaluation_key_path.is_file(): error_message = ( f"Error Encountered While generating the evaluation {evaluation_key_path.is_file()=}" ) print(error_message) return {gen_key_btn: gr.update(value=error_message)} else: return {gen_key_btn: gr.update(value="Keys have been generated ✅")} def encrypt_query_fn(query): print(f"Step 2 Query encryption: {query=}") evaluation_key_path = KEYS_DIR / f"{user_id}/evaluation_key" if not evaluation_key_path.is_file(): error_message = "Error ❌: Please generate the key first!" return {output_encrypted_box: gr.update(value=error_message)} if is_user_query_valid(query): error_msg = ( "Unable to process ❌: The request exceeds the length limit or falls " "outside the scope of this document. Please refine your query." ) print(error_msg) return {query_box: gr.update(value=error_msg)} # Retrieve the client API client = FHEModelClient(path_dir=DEPLOYMENT_DIR, key_dir=KEYS_DIR / f"{user_id}") client.load() # Pattern to identify words and non-words (including punctuation, spaces, etc.) tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", query) encrypted_tokens = [] for token in tokens: if bool(re.match(r"^\s+$", token)): continue # Directly append non-word tokens or whitespace to processed_tokens # Prediction for each word emb_x = get_batch_text_representation([token], EMBEDDINGS_MODEL, TOKENIZER) encrypted_x = client.quantize_encrypt_serialize(emb_x) assert isinstance(encrypted_x, bytes) encrypted_tokens.append(encrypted_x) write_pickle(KEYS_DIR / f"{user_id}/encrypted_input", encrypted_tokens) #anonymizer.encrypt_query(query) encrypted_quant_tokens_hex = [token.hex()[500:510] for token in encrypted_tokens] return {output_encrypted_box: gr.update(value=" ".join(encrypted_quant_tokens_hex))} def run_fhe_fn(query_box): evaluation_key_path = KEYS_DIR / "evaluation_key" if not evaluation_key_path.is_file(): error_message = "Error ❌: Please generate the key first!" return {anonymized_text_output: gr.update(value=error_message)} encryted_query_path = KEYS_DIR / "encrypted_quantized_query" if not encryted_query_path.is_file(): error_message = "Error ❌: Please encrypt your query first!" return {anonymized_text_output: gr.update(value=error_message)} anonymizer.run_server_and_decrypt_output(query_box) anonymized_text = read_pickle(KEYS_DIR / "reconstructed_sentence") # Removing Spaces Before Punctuation: anonymized_text = re.sub(r"\s([,.!?;:])", r"\1", anonymized_text) identified_words_with_prob = read_pickle(KEYS_DIR / "identified_words_with_prob") # Convert the list of identified words and probabilities into a DataFrame if identified_words_with_prob: identified_df = pd.DataFrame( identified_words_with_prob, columns=["Identified Words", "Probability"] ) else: identified_df = pd.DataFrame(columns=["Identified Words", "Probability"]) return anonymized_text, identified_df def query_chatgpt_fn(anonymized_query, anonymized_document): evaluation_key_path = KEYS_DIR / "evaluation_key" if not evaluation_key_path.is_file(): error_message = "Error ❌: Please generate the key first!" return {anonymized_text_output: gr.update(value=error_message)} encryted_query_path = KEYS_DIR / "encrypted_quantized_query" if not encryted_query_path.is_file(): error_message = "Error ❌: Please encrypt your query first!" return {anonymized_text_output: gr.update(value=error_message)} decrypted_query_path = KEYS_DIR / "reconstructed_sentence" if not decrypted_query_path.is_file(): error_message = "Error ❌: Please run the FHE computation first!" return {anonymized_text_output: gr.update(value=error_message)} prompt = read_txt(PROMPT_PATH) # Prepare prompt full_prompt = prompt + "\n" query = ( "Document content:\n```\n" + anonymized_document + "\n\n```" + "Query:\n```\n" + anonymized_query + "\n```" ) print(full_prompt) completion = client.chat.completions.create( model="gpt-4-1106-preview", # Replace with "gpt-4" if available messages=[ {"role": "system", "content": prompt}, {"role": "user", "content": query}, ], ) anonymized_response = completion.choices[0].message.content uuid_map = read_json(MAPPING_UUID_PATH) inverse_uuid_map = { v: k for k, v in uuid_map.items() } # TODO load the inverse mapping from disk for efficiency # Pattern to identify words and non-words (including punctuation, spaces, etc.) tokens = re.findall(r"(\b[\w\.\/\-@]+\b|[\s,.!?;:'\"-]+)", anonymized_response) processed_tokens = [] for token in tokens: # Directly append non-word tokens or whitespace to processed_tokens if not token.strip() or not re.match(r"\w+", token): processed_tokens.append(token) continue if token in inverse_uuid_map: processed_tokens.append(inverse_uuid_map[token]) else: processed_tokens.append(token) deanonymized_response = "".join(processed_tokens) return anonymized_response, deanonymized_response demo = gr.Blocks(css=".markdown-body { font-size: 18px; }") with demo: gr.Markdown( """
Concrete-ML
—
Documentation
—
Community
—
@zama_fhe
#
#
Encrypt the query locally with FHE
""" ) encrypt_btn = gr.Button("Encrypt query”") gr.HTML("") with gr.Column(scale=5): output_encrypted_box = gr.Textbox( label="Encrypted anonymized query that will be sent to the anonymization server:", lines=8 ) encrypt_btn.click( fn=encrypt_query_fn, inputs=[query_box], outputs=[query_box, output_encrypted_box] ) ########################## FHE processing Part ########################## gr.Markdown("