#!/usr/bin/env python
# coding: utf-8
"""Threshold-driven k-means clustering of feature vectors.

The number of clusters is grown one at a time until the relative drop in
k-means inertia falls below a threshold ``tau``. Results can either be
computed in memory (``features_clustering``) or read from / written to the
per-layer, per-class CSV files (``cluster_existed_features``).
"""

import itertools
import os.path
import time

import numpy as np
import pandas as pd

# from sklearnex import patch_sklearn, unpatch_sklearn
# patch_sklearn()
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift


def _fit_kmeans(values_to_cluster, n_clusters):
    """Fit a deterministic (``random_state=0``) k-means model."""
    model = KMeans(n_clusters=n_clusters, n_init="auto", random_state=0)
    return model.fit(values_to_cluster)


def modified_kmeans_cluster(values_to_cluster, threshold, k_start, n_clusters=None):
    """Cluster vectors with k-means, optionally searching for the cluster count.

    Args:
        values_to_cluster: Two-dimensional array holding m vectors of
            dimension n to be clustered.
        threshold: Minimum relative inertia improvement required to accept
            one more cluster (see ``terminate_clustering``).
        k_start: Number of clusters the search starts from. Ignored when
            ``n_clusters`` is given.
        n_clusters: If not ``None``, skip the search and fit exactly this
            many clusters.

    Returns:
        The ``labels_`` array of the selected k-means fit, one label per
        input vector.

    Raises:
        ValueError: If the search is requested on empty input.
    """
    if n_clusters is not None:
        return _fit_kmeans(values_to_cluster, n_clusters).labels_

    n_values = len(values_to_cluster)
    if n_values == 0:
        raise ValueError("values_to_cluster must contain at least one vector")

    n_clusters = k_start
    kmeans = _fit_kmeans(values_to_cluster, n_clusters)
    inertias = [kmeans.inertia_]

    # There can never be more clusters than vectors, so stop there at the latest.
    while n_values > n_clusters:
        candidate = _fit_kmeans(values_to_cluster, n_clusters + 1)
        inertias.append(candidate.inertia_)
        if terminate_clustering(inertias, threshold):
            # The extra cluster did not pay off: keep the previous fit.
            break
        kmeans = candidate
        n_clusters += 1

    return kmeans.labels_


def terminate_clustering(inertias, threshold):
    """Decide whether adding the latest cluster was no longer worthwhile.

    The relative improvement of the last inertia over the one before it is
    ``1 - inertias[-1] / inertias[-2]``; the search terminates when that
    improvement is below ``threshold``.

    Args:
        inertias: Inertia values of the successive fits, oldest first.
        threshold: Minimum relative improvement needed to continue.

    Returns:
        ``True`` if the search should stop, ``False`` otherwise.

    Raises:
        ValueError: If fewer than two inertia values are supplied.
    """
    if len(inertias) < 2:
        raise ValueError("at least two inertia values are required")

    previous, current = inertias[-2], inertias[-1]
    if previous == 0:
        # The previous fit is already perfect (e.g. duplicated points), so no
        # further improvement is possible; this also avoids dividing by zero.
        return True

    improvement = 1 - (current / previous)
    return improvement < threshold


def _cluster_pending_taus(values_to_cluster, taus, k_by_tau, n_clusters=None):
    """Cluster the values once for every tau that has no result yet.

    Args:
        values_to_cluster: Two-dimensional array of vectors to cluster.
        taus: Thresholds to process, in the order they should be handled.
        k_by_tau: Mapping ``float(tau) -> number of clusters`` for thresholds
            that already have a result. Updated in place with new results.
        n_clusters: Forwarded to ``modified_kmeans_cluster``.

    Returns:
        A dict mapping ``str(tau)`` to the label array computed for that tau,
        containing only the thresholds processed by this call.
    """
    labels_by_tau = {}
    for tau in taus:
        tau_key = float(tau)
        if tau_key in k_by_tau:
            continue

        # Start the search from the cluster count found for the closest
        # larger tau, if there is one; otherwise start from a single cluster.
        bigger_taus = [known for known in k_by_tau if known > tau_key]
        k_start = k_by_tau[min(bigger_taus)] if bigger_taus else 1

        labels = modified_kmeans_cluster(values_to_cluster, tau, k_start, n_clusters)
        labels_by_tau[str(tau)] = labels
        k_by_tau[tau_key] = int(max(labels)) + 1
    return labels_by_tau


def cluster_existed_features(network_folder_path, classes, layers_indexes, taus):
    """Cluster the stored features of every layer/class file for each tau.

    For each combination of layer index, class and correctly/incorrectly
    classified file, the feature CSV is read, clustered once per tau that is
    not yet a column of the matching ``clustering_results_class_*`` CSV, and
    the results file is written back with one label column per tau.

    Args:
        network_folder_path: Path prefix of the network folder. It is
            concatenated directly with ``"Layer_minus_<i>/..."``, so it is
            expected to end with a path separator.
        classes: Class identifiers used in the file names.
        layers_indexes: Layer indexes used in the folder names.
        taus: Thresholds to cluster with.
    """
    appendixes = [
        "_correctly_classified_features.csv",
        "_incorrectly_classified_features.csv",
    ]

    for i, y, appendix in itertools.product(layers_indexes, classes, appendixes):
        start_time = time.time()
        layer_folder = f"{network_folder_path}Layer_minus_{i}/"

        # Load data for class y at layer minus i; the feature values start at
        # the fourth column.
        df = pd.read_csv(f"{layer_folder}class_{y}{appendix}")
        values_to_cluster = df[df.columns[3:]].to_numpy()
        if len(values_to_cluster) == 0:
            continue

        # Load existing clustering results, if any, and recover the number of
        # clusters already found for each stored tau.
        clustering_results_path = f"{layer_folder}clustering_results_class_{y}{appendix}"
        k_by_tau = {}
        if os.path.exists(clustering_results_path):
            clustering_results = pd.read_csv(clustering_results_path)
            for col in clustering_results.columns[3:]:
                # Keyed by float so that "1" and "1.0" denote the same tau.
                k_by_tau[float(col)] = int(clustering_results[col].max()) + 1
        else:
            clustering_results = pd.DataFrame(
                df, columns=["index", "true_label", "pred_label"]
            )

        new_labels = _cluster_pending_taus(values_to_cluster, taus, k_by_tau)
        for column, labels in new_labels.items():
            clustering_results[column] = labels

        clustering_results.to_csv(clustering_results_path, index=False)

        elapsed_time = time.time() - start_time
        print(
            f"file:Layer_minus_{i}_class_{y}{appendix},",
            "lasting time:",
            elapsed_time,
            "seconds",
        )


def features_clustering(features, taus, nb_clusters):
    """Cluster in-memory feature vectors once per tau.

    Args:
        features: Two-dimensional array of feature vectors.
        taus: Thresholds to cluster with.
        nb_clusters: Passed as ``n_clusters`` to ``modified_kmeans_cluster``;
            when not ``None`` every tau is clustered with exactly this many
            clusters.

    Returns:
        A dict mapping ``str(tau)`` to the label array for that tau. The dict
        is empty when ``features`` is empty.
    """
    if len(features) == 0:
        return {}
    return _cluster_pending_taus(features, taus, {}, nb_clusters)