Spaces:
No application file
No application file
File size: 994 Bytes
21b78eb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
from common_module import *
import hdbscan
from sklearn.preprocessing import StandardScaler
from hdbscan import HDBSCAN
class documentClustering:
    """Scale feature data and cluster it with HDBSCAN.

    The scaler and the fitted clusterer are persisted as pickle files under
    ``<data_path>/models`` (``scaler_model.pkl`` and ``clusterer_model.pkl``).
    """

    def __init__(self, data_path):
        """Load the persisted scaler (when present) and the persisted clusterer.

        Args:
            data_path: Directory that contains the ``models`` sub-directory.

        Raises:
            FileNotFoundError: Raised by ``open`` when
                ``models/clusterer_model.pkl`` does not exist.
        """
        # SECURITY NOTE: unpickling can execute arbitrary code, so data_path
        # must only ever point at model files from a trusted source.
        scaler_path = os.path.join(data_path, "models/scaler_model.pkl")
        clusterer_path = os.path.join(data_path, "models/clusterer_model.pkl")

        # Reuse a previously saved scaler when there is one; otherwise start
        # from a fresh, unfitted StandardScaler.
        if os.path.exists(scaler_path):
            # Context manager so the file handle is closed deterministically
            # instead of being left for the garbage collector.
            with open(scaler_path, 'rb') as scaler_file:
                self.scaler = pickle.load(scaler_file)
        else:
            self.scaler = StandardScaler()

        # NOTE(review): the clusterer is loaded unconditionally, so an
        # instance cannot be created before a clusterer pickle exists --
        # confirm this is intended for the first training run.
        with open(clusterer_path, 'rb') as clusterer_file:
            self.clusterer = pickle.load(clusterer_file)

    def train_cluster(self, train_df):
        """Fit the scaler and a new HDBSCAN model on ``train_df``.

        ``self.scaler`` is refitted in place. The newly fitted model is only
        returned; it is not stored on ``self.clusterer``.

        Args:
            train_df: Training features, passed straight to
                ``self.scaler.fit_transform``.

        Returns:
            A ``(model, scaler)`` tuple: the fitted HDBSCAN model and
            ``self.scaler``.
        """
        scaled = self.scaler.fit_transform(train_df)
        # prediction_data=True is kept so the fitted model can later be used
        # with hdbscan.approximate_predict (see test_cluster).
        model = HDBSCAN(
            min_cluster_size=100,
            min_samples=1,
            metric='euclidean',
            cluster_selection_method='eom',
            gen_min_span_tree=True,
            prediction_data=True,
        ).fit(scaled)
        return model, self.scaler

    def test_cluster(self, test_df):
        """Assign cluster labels to ``test_df`` using the loaded clusterer.

        Args:
            test_df: Features to label, passed to ``self.scaler.transform``.

        Returns:
            The labels returned by ``hdbscan.approximate_predict``; the
            membership strengths it also returns are discarded.
        """
        scaled = self.scaler.transform(test_df)
        test_labels, _strengths = hdbscan.approximate_predict(self.clusterer, scaled)
        return test_labels
|