File size: 16,924 Bytes
e07b55c
dd8b714
 
 
 
 
 
 
 
 
 
 
 
 
 
be74cf1
31692b0
dd8b714
 
48f1c5b
c61a090
 
a8246da
dd8b714
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2b27752
 
75c6c70
 
dfd9163
 
dd8b714
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31692b0
dd8b714
 
 
 
 
2b7a723
88ea699
dd8b714
 
 
 
 
 
 
 
31692b0
 
dd8b714
b59defd
be74cf1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd8b714
 
be74cf1
 
 
60be4f0
 
2920351
be74cf1
 
 
 
dd8b714
d1b5444
a3956a0
31692b0
88ea699
c6147ee
d6bafb7
27f0f55
0cc3b77
dd8b714
0cc3b77
dd8b714
0cc3b77
dd8b714
 
bc231b1
dd8b714
 
 
 
be74cf1
 
091902f
dd8b714
31692b0
dd8b714
 
 
 
 
 
 
be74cf1
 
dd8b714
 
 
 
 
 
 
 
e9bd8c0
dd8b714
 
 
 
 
 
 
be74cf1
 
 
 
dd8b714
 
31692b0
dd8b714
 
 
 
 
 
 
 
 
 
 
 
 
44e0483
493da40
44e0483
493da40
44e0483
493da40
e07b55c
 
dd8b714
106750c
 
 
60be4f0
106750c
 
60be4f0
 
 
 
106750c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a3956a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106750c
a3956a0
 
 
 
 
 
106750c
 
a3956a0
 
 
 
 
 
 
 
 
 
 
 
 
 
106750c
a3956a0
 
 
 
 
 
 
 
 
 
106750c
a3956a0
 
 
 
 
 
 
 
 
 
 
106750c
 
60be4f0
a3956a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a4af447
a3956a0
 
 
 
 
9a91a20
a3956a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31692b0
a3956a0
ba74a50
a3956a0
31692b0
a3956a0
ba74a50
a3956a0
 
20140eb
b59defd
a3956a0
 
 
 
 
20140eb
31692b0
a3956a0
 
 
 
31692b0
a3956a0
 
31692b0
a3956a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
import gradio as gr
import subprocess
from concrete.ml.deployment import FHEModelClient
from requests import head
import numpy
import os
from pathlib import Path
import requests
import json
import base64
import shutil
import time
import pandas as pd
import pickle
import numpy as np
import pdb

# Directory containing this script; the inference server is started from it.
REPO_DIR = Path(__file__).parent

# Launch the ASGI app `server:app` with uvicorn in a background process.
# No --port is given, so uvicorn uses its default port (8000), which is the
# port run_fhe() posts to. Keep both in sync if the port is ever changed.
subprocess.Popen(["uvicorn", "server:app"], cwd=REPO_DIR)

# Working directories, relative to the current working directory:
#   .fhe_keys/  one sub-directory of client keys per user id
#   tmp/        serialized keys / ciphertexts passed between the demo steps
# exist_ok=True replaces the racy "check, then create" pattern.
os.makedirs(".fhe_keys", exist_ok=True)
os.makedirs("tmp", exist_ok=True)

# Wait 4 sec for the server to start
time.sleep(4)

# Encrypted data limit for the browser to display
# (encrypted data is too large to display in the browser)
ENCRYPTED_DATA_BROWSER_LIMIT = 500
# Maximum number of per-user key directories kept under .fhe_keys/
N_USER_KEY_STORED = 20

# Initial values used for the hidden Gradio components built in __main__.
# Evaluation key
eval_key = []
# Encodings vector
encodings = []
# User ID
user_id = []

def clean_tmp_directory(max_stored_keys=None, keys_dir=".fhe_keys", tmp_dir="tmp"):
    """Evict the oldest user key directories and their temporary files.

    Sub-directories of ``keys_dir`` are ordered by modification time; when
    there are more than ``max_stored_keys`` of them, the oldest ones are
    removed. Every file in ``tmp_dir`` whose name ends with
    ``_<user_id>.npy`` for an evicted user id is removed as well.

    Args:
        max_stored_keys: Number of key directories to keep. ``None`` (the
            default) uses the module-level ``N_USER_KEY_STORED``.
        keys_dir: Directory holding one sub-directory per user id.
        tmp_dir: Directory holding the per-user ``*.npy`` temporary files.
    """
    if max_stored_keys is None:
        max_stored_keys = N_USER_KEY_STORED

    key_dirs = sorted(
        (p for p in Path(keys_dir).iterdir() if p.is_dir()),
        key=os.path.getmtime,
    )

    n_to_delete = len(key_dirs) - max_stored_keys
    if n_to_delete <= 0:
        return

    evicted = key_dirs[:n_to_delete]
    for key_dir in evicted:
        shutil.rmtree(key_dir)

    # Match on "_<user_id>.npy", not just "<user_id>.npy": a bare suffix
    # match would also delete files of a surviving user whose id merely ends
    # with an evicted id (e.g. evicting "23" must not touch "..._123.npy").
    stale_suffixes = tuple(f"_{key_dir.name}.npy" for key_dir in evicted)

    # Snapshot the listing so unlinking does not interfere with iteration.
    for tmp_file in list(Path(tmp_dir).iterdir()):
        if tmp_file.name.endswith(stale_suffixes):
            tmp_file.unlink()


def keygen(eval_key, user_id):
    """Create a new user id, generate its FHE keys and store the evaluation key.

    The serialized evaluation key is saved to
    ``tmp/tmp_evaluation_key_<user_id>.npy``; the client keys live under
    ``.fhe_keys/<user_id>``.

    Args:
        eval_key: Ignored. Present only because the Gradio click handler
            passes the hidden component's current value.
        user_id: Ignored. A fresh random id is always drawn.

    Returns:
        tuple: ``(truncated_key, new_user_id)`` where ``truncated_key`` is a
        list holding the first ``ENCRYPTED_DATA_BROWSER_LIMIT`` byte values of
        the serialized evaluation key and ``new_user_id`` is a Python int.
    """
    # Clean tmp directory if needed
    clean_tmp_directory()

    print("Initializing FHEModelClient...")
    # Draw the id as int64 so the 2**32 upper bound is valid even where
    # NumPy's default integer is 32-bit, then convert to a plain int so the
    # id formats identically in every file name.
    new_user_id = int(numpy.random.randint(0, 2**32, dtype=numpy.int64))
    fhe_api = FHEModelClient("fhe_model", f".fhe_keys/{new_user_id}")
    fhe_api.load()

    # Generate a fresh key
    fhe_api.generate_private_and_evaluation_keys(force=True)
    evaluation_key = fhe_api.get_serialized_evaluation_keys()

    numpy.save(f"tmp/tmp_evaluation_key_{new_user_id}.npy", evaluation_key)

    # Slice first: converting the whole serialized key to a list only to keep
    # its first few hundred items wastes time and memory.
    truncated_key = list(evaluation_key[:ENCRYPTED_DATA_BROWSER_LIMIT])
    return truncated_key, new_user_id

def encode_quantize(test_file, eval_key, encodings):
    """Extract the model's feature vector from an uploaded file.

    ``PE_main.extract_infos`` is called on ``test_file`` and its result is
    indexed by each of the 53 feature names below, in this exact order. The
    values are returned as a NumPy array of shape ``(1, 53)``.

    No FHE client is needed here: the original code built one with
    ``eval_key`` as its key directory and never used it, so that step was
    dropped.

    Args:
        test_file: Value delivered by the ``gr.File`` input and handed
            unchanged to ``extract_infos``.
            NOTE(review): its exact type depends on the Gradio version --
            confirm it is what ``extract_infos`` expects.
        eval_key: Ignored; kept for compatibility with the click wiring.
        encodings: Ignored; kept for compatibility with the click wiring.

    Returns:
        numpy.ndarray: array of shape ``(1, 53)`` with one value per feature.

    Raises:
        KeyError: if ``extract_infos`` does not provide one of the features
            (assuming it returns a dict-like object).
    """
    # Imported lazily, as in the original, so that PE_main is only required
    # when features are actually extracted.
    from PE_main import extract_infos

    # Order matters: it defines the column order of the returned vector.
    feature_names = [
        'Machine', 'SizeOfOptionalHeader', 'Characteristics',
        'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode',
        'SizeOfInitializedData', 'SizeOfUninitializedData',
        'AddressOfEntryPoint', 'BaseOfCode', 'BaseOfData', 'ImageBase',
        'SectionAlignment', 'FileAlignment', 'MajorOperatingSystemVersion',
        'MinorOperatingSystemVersion', 'MajorImageVersion', 'MinorImageVersion',
        'MajorSubsystemVersion', 'MinorSubsystemVersion', 'SizeOfImage',
        'SizeOfHeaders', 'CheckSum', 'Subsystem', 'DllCharacteristics',
        'SizeOfStackReserve', 'SizeOfStackCommit', 'SizeOfHeapReserve',
        'SizeOfHeapCommit', 'LoaderFlags', 'NumberOfRvaAndSizes', 'SectionsNb',
        'SectionsMeanEntropy', 'SectionsMinEntropy', 'SectionsMaxEntropy',
        'SectionsMeanRawsize', 'SectionsMinRawsize',
        'SectionsMeanVirtualsize', 'SectionsMinVirtualsize',
        'SectionMaxVirtualsize', 'ImportsNbDLL', 'ImportsNb',
        'ImportsNbOrdinal', 'ExportNb', 'ResourcesNb', 'ResourcesMeanEntropy',
        'ResourcesMinEntropy', 'ResourcesMaxEntropy', 'ResourcesMeanSize',
        'ResourcesMinSize', 'ResourcesMaxSize', 'LoadConfigurationSize',
        'VersionInformationSize',
    ]

    extracted = extract_infos(test_file)
    feature_vector = [extracted[name] for name in feature_names]

    # Reshape the flat (53,) vector into a single-row (1, 53) batch.
    return np.array(feature_vector).reshape(1, -1)

def encrypt_encoded_quantize(encodings, user_id, eval_key):
    """Quantize, encrypt and serialize the feature vector for a user.

    The ciphertext is saved to
    ``tmp/tmp_encrypted_quantized_encoding_<user_id>.npy``.

    The original debug prints added the string ``"/n"`` to a NumPy array and
    to the serialized ciphertext, which raises ``TypeError`` before anything
    is saved. They are replaced by a single working print, and the quantized
    array that was computed only for printing is no longer built.

    Args:
        encodings: Feature vector as delivered by the "Extracted vector"
            component; converted with ``np.array``.
        user_id: Id returned by ``keygen``. A float such as ``123.0`` is
            accepted and normalized to ``123``.
        eval_key: Ignored; kept for compatibility with the click wiring.

    Returns:
        str: hex string of the first ``ENCRYPTED_DATA_BROWSER_LIMIT`` bytes of
        the serialized ciphertext.
    """
    # gr.Number may hand the id back as a float; without normalizing it the
    # paths below would read ".../123.0" and miss the keys stored by keygen.
    user_id = int(user_id)

    fhe_api = FHEModelClient("fhe_model", f".fhe_keys/{user_id}")
    fhe_api.load()

    encodings = np.array(encodings)
    print(encodings)

    encrypted_quantized_encoding = fhe_api.quantize_encrypt_serialize(encodings)

    numpy.save(
        f"tmp/tmp_encrypted_quantized_encoding_{user_id}.npy",
        encrypted_quantized_encoding,
    )

    # Only a short prefix is shown in the browser; slice before formatting.
    preview = bytes(encrypted_quantized_encoding[:ENCRYPTED_DATA_BROWSER_LIMIT])
    return preview.hex()

def run_fhe(user_id):
    """Send the user's ciphertext and evaluation key to the server for inference.

    Reads ``tmp/tmp_encrypted_quantized_encoding_<user_id>.npy`` and
    ``tmp/tmp_evaluation_key_<user_id>.npy``, posts both base64-encoded to
    ``http://localhost:8000/predict`` and saves the decoded
    ``encrypted_prediction`` field of the JSON reply to
    ``tmp/tmp_encrypted_prediction_<user_id>.npy``.

    Args:
        user_id: Id returned by ``keygen``. A float such as ``123.0`` is
            accepted and normalized to ``123``.

    Returns:
        str: hex string of the first ``ENCRYPTED_DATA_BROWSER_LIMIT`` bytes of
        the encrypted prediction.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # gr.Number may hand the id back as a float; keygen named the files with
    # the integer form.
    user_id = int(user_id)

    encrypted_quantized_encoding = numpy.load(
        Path(f"tmp/tmp_encrypted_quantized_encoding_{user_id}.npy")
    )
    # Read evaluation_key from the file
    evaluation_key = numpy.load(f"tmp/tmp_evaluation_key_{user_id}.npy")

    # Use base64 to encode the encodings and evaluation key
    query = {
        "evaluation_key": base64.b64encode(evaluation_key).decode(),
        "encrypted_encoding": base64.b64encode(encrypted_quantized_encoding).decode(),
    }

    response = requests.post(
        "http://localhost:8000/predict",
        data=json.dumps(query),
        headers={"Content-type": "application/json"},
    )
    # Fail with the real HTTP error instead of a confusing KeyError / JSON
    # decoding error further down.
    response.raise_for_status()

    encrypted_prediction = base64.b64decode(response.json()["encrypted_prediction"])

    numpy.save(f"tmp/tmp_encrypted_prediction_{user_id}.npy", encrypted_prediction)

    # Only a short prefix is shown in the browser; slice before formatting.
    return encrypted_prediction[:ENCRYPTED_DATA_BROWSER_LIMIT].hex()


def decrypt_prediction(user_id):
    """Decrypt the stored prediction for a user and turn it into a verdict.

    Reads ``tmp/tmp_encrypted_prediction_<user_id>.npy``, decrypts it with
    the client keys under ``.fhe_keys/<user_id>`` and thresholds the first
    value of the first row at 0.5.

    Args:
        user_id: Id returned by ``keygen``. A float such as ``123.0`` is
            accepted and normalized to ``123``.

    Returns:
        str: ``"Safe file"`` when ``predictions[0][0] >= 0.5``, otherwise
        ``"Malware"``.
    """
    # gr.Number may hand the id back as a float; keygen named the key
    # directory and files with the integer form.
    user_id = int(user_id)

    # Read encrypted_prediction from the file
    encrypted_prediction = numpy.load(
        Path(f"tmp/tmp_encrypted_prediction_{user_id}.npy")
    ).tobytes()

    fhe_api = FHEModelClient("fhe_model", f".fhe_keys/{user_id}")
    fhe_api.load()

    # We need to retrieve the private key that matches the client specs (see issue #18)
    fhe_api.generate_private_and_evaluation_keys(force=False)

    predictions = fhe_api.deserialize_decrypt_dequantize(encrypted_prediction)

    # NOTE(review): this treats output index 0 as the "safe" score. Confirm
    # against the label encoding used at training time -- if class 0 is the
    # malware class, the two verdicts are swapped.
    if predictions[0][0] >= 0.5:
        return "Safe file"
    return "Malware"
    


if __name__ == "__main__":
    # Builds the Gradio page: a pitch / explanation section, the six-step
    # interactive FHE demo, and a closing business section.
    #
    # NOTE(review): every emoji and dash in the user-facing strings was stored
    # as mis-decoded UTF-8 and has been restored. Three glyphs had lost a byte
    # and are best guesses: the lock on the "Encrypt the file" button, the
    # magnifier on the "Decrypt scan result" button and the globe in
    # "Market Opportunity".
    print("🚀 Starting the ClairVault demo...")
    with gr.Blocks(css=".gradio-container { font-size: 20px; }") as demo:

        # ------------------------------------------------------------------
        # Header and pitch
        # ------------------------------------------------------------------
        # NOTE(review): the "/api/placeholder/..." image URLs look like
        # placeholders -- confirm they resolve in the deployed app.
        gr.Markdown(
            """
            <p align="center">
                <img width=200 src="/api/placeholder/200/200" alt="ClairVault Logo">
            </p>
            <h2 align="center">🔒 ClairVault: Privacy-Preserving Cloud-Based Malware Scanning</h2>
            <p align="center">
                <a href="#">GitHub</a>
                —
                <a href="#">Documentation</a>
                —
                <a href="#">Community</a>
                —
                <a href="#">@ClairVault</a>
            </p>
            <p align="center">
            <img src="/api/placeholder/600/300" alt="ClairVault Concept" width="60%" height="60%">
            </p>
            """
        )

        gr.Markdown("## 📋 Executive Summary")
        gr.Markdown(
        """
        ### ClairVault is a **malware scanning service**, cloud-based and machine-learning enabled, that leverages fully homomorphic encryption (FHE) to securely scan files without seeing your data.
        """
        )

        gr.Markdown("## 🚨 The Problem")
        gr.Markdown(
        """
        1. Privacy: do you really trust the Russian-based Kaspersky and that there is no back-door built in McAfee by Uncle Sam?
            - Require access to plaintext data, posing privacy and security risks
            - Often closed-source, lacking transparency
        2. Protection:
            - The malware scanner itself might be compromised
            - Require frequent local updates
            - Mostly rule-based → need machine learning - but mostly on cloud
        3. Resources
            - Consume local resources (Windows Defender only runs ML based models on the cloud)
        """
        )

        gr.Markdown("## 💡 Our Solution: ClairVault")
        gr.Markdown(
        """
        Key features include:
        - Local extraction of features and encryption of user data (files, logs)
        - Transmission of encrypted data to secure, open-source cloud servers
        - Malware classification performed on encrypted data
        - Return of encrypted classification results
        """
        )

        gr.Markdown("### 🛠️ Technical Implementation")
        gr.Markdown(
        """
        - **Encryption Method**: Fully Homomorphic Encryption using the TFHE (Fast Fully Homomorphic Encryption over the Torus) library
        - **Machine Learning Model**: Linear classifier optimized for FHE computations using ConcreteML
        - **Performance Metrics**:
            - Encrypted file scanning: ~30 seconds per MB
            - Plaintext file scanning: ~0.5 seconds per MB
            - Estimated time to scan 1GB: ~8.5 hours (encrypted) vs. ~8.5 minutes (plaintext)
        *Note: These are preliminary figures based on our proof-of-concept. We aim to significantly improve performance in future iterations.*
        """
        )

        # ------------------------------------------------------------------
        # Interactive demo: components
        # ------------------------------------------------------------------
        gr.Markdown("# 🗝️ Step 1: Generate the keys")

        b_gen_key = gr.Button("🔑 Generate the keys and send public part to server")

        evaluation_key = gr.Textbox(
            label="Evaluation key (truncated):",
            max_lines=4,
            interactive=False,
        )

        gr.Markdown("# 📤 Step 2: Upload a file for scanning")
        gr.Markdown("## Client side")
        gr.Markdown(
            "Upload a file you want to scan for malware. ClairVault will encrypt it locally before sending it to the cloud."
        )
        file_input = gr.File(label="Upload a file:", file_count="single", value="./smallexe64.exe")

        gr.Markdown("# 📥 Step 3: Extract executable file features")

        b_extract = gr.Button("📥 Extract features and save")

        extracted_vector = gr.JSON(
            label="Extracted vector:",
        )

        gr.Markdown("# 🔒 Step 4: Encrypt the file with the private key")
        b_encrypt_file = gr.Button(
            "🔐 Encrypt the file and send to server"
        )

        encrypted_file = gr.Textbox(
            label="Encrypted file content (truncated):",
            max_lines=4,
            interactive=False,
        )

        gr.Markdown("# 🖥️ Step 5: Run the FHE-based malware scan")
        gr.Markdown("## Server side")
        gr.Markdown(
            "The encrypted file is received by the server. Using the evaluation key and FHE, the server can perform the malware scan directly on the encrypted data. Once the scan is finished, the server returns the encrypted result to the client."
        )

        b_run_fhe_scan = gr.Button("🛡️ Run FHE-based malware scan")
        encrypted_scan_result = gr.Textbox(
            label="Encrypted scan result (truncated):",
            max_lines=4,
            interactive=False,
        )

        gr.Markdown("# 🔓 Step 6: Decrypt the scan result")
        gr.Markdown("## Client side")
        gr.Markdown(
            "The encrypted scan result is sent back to the client, who can finally decrypt it with their private key. Only the client is aware of the original file content and the scan result."
        )
        b_decrypt_result = gr.Button("🔍 Decrypt scan result")
        # Hidden component carrying the user id produced by keygen from one
        # step to the next.
        user_id_input = gr.Number(visible=False)
        scan_result = gr.Textbox(label="Scan Result:")
        # Hidden components whose values are passed to handlers that accept
        # (and ignore) them.
        eval_key_input = gr.Textbox(value=eval_key, visible=False)
        encodings_input = gr.Textbox(value=encodings, visible=False)

        # ------------------------------------------------------------------
        # Interactive demo: event wiring
        # ------------------------------------------------------------------
        # Button for key generation
        b_gen_key.click(
            fn=keygen,
            inputs=[eval_key_input, user_id_input],
            outputs=[evaluation_key, user_id_input],
        )

        # Button to extract vector
        b_extract.click(
            fn=encode_quantize,
            inputs=[file_input, eval_key_input, encodings_input],
            outputs=[extracted_vector],
        )

        # Button to encrypt file
        b_encrypt_file.click(
            fn=encrypt_encoded_quantize,
            inputs=[extracted_vector, user_id_input, eval_key_input],
            outputs=[encrypted_file],
        )

        # Button to run FHE-based malware scan
        b_run_fhe_scan.click(fn=run_fhe, inputs=[user_id_input], outputs=[encrypted_scan_result])

        # Button to decrypt the scan result
        b_decrypt_result.click(fn=decrypt_prediction, inputs=[user_id_input], outputs=[scan_result])

        # ------------------------------------------------------------------
        # Closing business section
        # ------------------------------------------------------------------
        gr.Markdown(
            "ClairVault is built using advanced Fully Homomorphic Encryption techniques to ensure your data remains private and secure throughout the entire malware scanning process."
        )
        gr.Markdown("## 🌍 Market Opportunity")
        gr.Markdown(
        """
        The global cybersecurity market is valued at $60 billion in 2024 with an annual growth rate of 15% projected by Morgan Stanley. In 2023, there were 6 billion cyberattacks, a 10% increase from 2022.
        **Target Industries**: Finance, Healthcare, Government, Legal Services, Individuals
        **Estimated TAM (Total Addressable Market) for Privacy-Preserving Malware Scanning**:
        While exact figures for malware scanning are not available, we estimate it to be approximately 10% of the total cybersecurity market, or $6 billion. (Kaspersky has 700 million of revenue, Norton 1.5 billion, McAfee 2.5 billion.)
        Based on the critical need in our target industries we estimate privacy-preserving to be 5-10% of that, approximately $300-600 million.
        """
        )

        gr.Markdown("## 🏆 Competitive Advantage")
        gr.Markdown(
        """
        Unlike traditional solutions:
        1. Complete data privacy through FHE
        2. Open-source transparency
        3. Cloud-based scanning without local resource consumption
        4. Immunity to local malware compromise
        """
        )

        gr.Markdown("## 🚀 Go-To-Market Strategy")
        gr.Markdown("### 🎯 Target Customers")
        gr.Markdown(
        """
        1. Enterprise Clients in sensitive industries
        2. Cloud Service Providers
        3. Cybersecurity Firms
        4. Privacy-conscious individuals
        """
        )

        gr.Markdown("### 💰 Revenue Model")
        gr.Markdown(
        """
        1. Enterprise Licensing
        2. Tiered Subscription Plans
        3. API Access Fees
        """
        )

        gr.Markdown("### 📈 Sales and Marketing Channels")
        gr.Markdown(
        """
        1. Direct Enterprise Sales
        2. Partnerships with cloud providers and cybersecurity firms
        3. Industry events and conferences
        4. Content marketing (whitepapers, case studies)
        """
        )

        gr.Markdown("## 🏅 Achievements and Roadmap")
        gr.Markdown("### 🏆 Current Achievements")
        gr.Markdown(
        """
        - Developed a proof-of-concept multiscanner using the TFHE library
        - Successfully demonstrated end-to-end process from local encryption to cloud scanning
        - Implemented a linear classifier for malware detection on encrypted data
        """
        )

        gr.Markdown("### 🔮 Future Development")
        gr.Markdown(
        """
        1. Develop real-time scanning capabilities
        2. Add behavior analysis through encrypted log processing
        3. Expand to support a wider range of file types and encryption schemes
        """
        )

    demo.launch(share=False)