# ClairVault / train_FHE.py
import pickle

import joblib
import numpy
import pandas as pd
import sklearn.ensemble as ek
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# Load the PE-header dataset; fields in the CSV are pipe-separated.
dataset = pd.read_csv("data.csv", sep="|")

# Candidate feature columns: numeric PE-header fields only. "Name" and "md5"
# are object-typed (the classifiers accept numeric input only) and
# "legitimate" is the target, so all three are excluded.
feature_columns = [
"Machine",
"SizeOfOptionalHeader",
"Characteristics",
"MajorLinkerVersion",
"MinorLinkerVersion",
"SizeOfCode",
"SizeOfInitializedData",
"SizeOfUninitializedData",
"AddressOfEntryPoint",
"BaseOfCode",
"BaseOfData",
"ImageBase",
"SectionAlignment",
"FileAlignment",
"MajorOperatingSystemVersion",
"MinorOperatingSystemVersion",
"MajorImageVersion",
"MinorImageVersion",
"MajorSubsystemVersion",
"MinorSubsystemVersion",
"SizeOfImage",
"SizeOfHeaders",
"CheckSum",
"Subsystem",
"DllCharacteristics",
"SizeOfStackReserve",
"SizeOfStackCommit",
"SizeOfHeapReserve",
"SizeOfHeapCommit",
"LoaderFlags",
"NumberOfRvaAndSizes",
"SectionsNb",
"SectionsMeanEntropy",
"SectionsMinEntropy",
"SectionsMaxEntropy",
"SectionsMeanRawsize",
"SectionsMinRawsize",
#"SectionsMaxRawsize",
"SectionsMeanVirtualsize",
"SectionsMinVirtualsize",
"SectionMaxVirtualsize",
"ImportsNbDLL",
"ImportsNb",
"ImportsNbOrdinal",
"ExportNb",
"ResourcesNb",
"ResourcesMeanEntropy",
"ResourcesMinEntropy",
"ResourcesMaxEntropy",
"ResourcesMeanSize",
"ResourcesMinSize",
"ResourcesMaxSize",
"LoadConfigurationSize",
"VersionInformationSize",
]
X = dataset[feature_columns].values
y = dataset["legitimate"].values

# Fit an ExtraTrees model on a 1,000-row subset (for speed) and keep only the
# features whose importance exceeds SelectFromModel's default threshold, the
# mean importance.
extratrees = ek.ExtraTreesClassifier().fit(X[:1000], y[:1000])
selector = SelectFromModel(extratrees, prefit=True)
X_new = selector.transform(X)
nbfeatures = X_new.shape[1]
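# Optional sanity check (not in the original script): SelectFromModel's
# boolean support mask maps the kept columns of X_new back to their names.
kept = [name for name, keep in zip(feature_columns, selector.get_support()) if keep]
print("%d features kept by SelectFromModel: %s" % (nbfeatures, kept))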
# Split into train and test sets (test_size=0.29, i.e. roughly a 70/30 split),
# stratified so both sets keep the dataset's class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.29, stratify=y
)
# Rank the selected features by importance (descending). Importances are
# indexed against feature_columns, the column order used to build X, so the
# names are looked up there rather than in dataset.columns.
features = []
index = numpy.argsort(extratrees.feature_importances_)[::-1][:nbfeatures]
for f in range(nbfeatures):
    print(
        "%d. feature %s (%f)"
        % (
            f + 1,
            feature_columns[index[f]],
            extratrees.feature_importances_[index[f]],
        )
    )
    features.append(feature_columns[index[f]])
# Train two baseline classifiers on the selected features and keep the one
# with the best test accuracy.
models = {
    "DecisionTree": DecisionTreeClassifier(max_depth=10),
    "RandomForest": ek.RandomForestClassifier(n_estimators=50),
}
results = {}
for algo, clf in models.items():
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print("%s : %s " % (algo, score))
    results[algo] = score
winner = max(results, key=results.get)  # classifier with the best test accuracy
print("Using", winner, "for classification, with", len(features), "features.")

# Persist the winning model and the ordered feature list for later inference.
joblib.dump(models[winner], "classifier.pkl")
with open("features.pkl", "wb") as f:
    pickle.dump(features, f)
# FHE part: train an encrypted-inference counterpart of the classifier with
# Concrete ML (Zama) and simulate client/server deployment via the project's
# fhe_utils helpers.
from fhe_utils import (
    client_server_interaction,
    copy_directory,
    setup_client,
    setup_network,
    train_zama,
)

model_dev_fhe = train_zama(X_train, y_train)
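# train_zama is defined in fhe_utils; as a rough sketch, such a helper would
# presumably use Concrete ML's sklearn-style API (an assumption about the
# helper, not the project's actual code):
#
#     from concrete.ml.sklearn import DecisionTreeClassifier as FHETree
#     fhe_model = FHETree(max_depth=5)   # hypothetical hyperparameters
#     fhe_model.fit(X_train, y_train)    # quantized training on clear data
#     fhe_model.compile(X_train)         # build the FHE circuit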
# Stand up the simulated deployment: dev compiles and ships the model, the
# server runs encrypted inference, and the client holds the keys.
network, _ = setup_network(model_dev_fhe)

# Keep a copy of the compiled FHE model artifacts for deployment.
copied, error_message = copy_directory(network.dev_dir.name, destination="fhe_model")
if not copied:
    print(f"Error copying directory: {error_message}")

network.dev_send_model_to_server()
network.dev_send_clientspecs_and_modelspecs_to_client()

# The client derives its keys from the received specs; the public evaluation
# keys go to the server so it can compute on encrypted inputs.
fhemodel_client, serialized_evaluation_keys = setup_client(
    network, network.client_dir.name
)
print(f"Evaluation keys size: {len(serialized_evaluation_keys)} B")
network.client_send_evaluation_key_to_server(serialized_evaluation_keys)

# Encrypt 100 test samples client-side, run them through the server's FHE
# circuit, and decrypt the predictions client-side.
decrypted_predictions, execution_time = client_server_interaction(
    network, fhemodel_client, X_test[:100]
)
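# A minimal follow-up sketch (not in the original script), assuming the
# helper returns class predictions aligned with the 100 encrypted samples:
#
#     accuracy = numpy.mean(
#         numpy.asarray(decrypted_predictions).ravel() == y_test[:100]
#     )
#     print("FHE accuracy on 100 samples: %.3f (%s s)" % (accuracy, execution_time))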