Spaces:
No application file
No application file
from common_module import * | |
import hdbscan | |
from sklearn.preprocessing import StandardScaler | |
from hdbscan import HDBSCAN | |
class documentClustering:
    """Scale feature vectors and cluster them with HDBSCAN.

    On construction the class looks for two pickled artefacts below
    ``data_path``:

    * ``models/scaler_model.pkl``    -- a previously fitted scaler
    * ``models/clusterer_model.pkl`` -- a previously fitted HDBSCAN model

    A missing scaler is replaced by a fresh, unfitted ``StandardScaler``.
    A missing clusterer leaves ``self.clusterer`` set to ``None`` so that the
    object can still be created in order to train the very first model;
    ``test_cluster`` then refuses to run until a clusterer is available.

    SECURITY NOTE: both artefacts are read with ``pickle``, which can execute
    arbitrary code while loading.  Only point ``data_path`` at directories
    whose contents you trust.
    """

    # Locations of the persisted models, relative to ``data_path``.
    _SCALER_FILE = "models/scaler_model.pkl"
    _CLUSTERER_FILE = "models/clusterer_model.pkl"

    def __init__(self, data_path):
        """Load the persisted scaler and clusterer, if they exist.

        Args:
            data_path: Directory that contains the ``models`` sub-directory.

        Raises:
            OSError: If a model file exists but cannot be read (for example
                because of permissions).  A file that is simply absent is
                not an error.
        """
        self._clusterer_path = os.path.join(data_path, self._CLUSTERER_FILE)

        scaler = self._load_pickle(os.path.join(data_path, self._SCALER_FILE))
        # No persisted scaler yet: start from an unfitted one so that
        # train_cluster() can fit it.
        self.scaler = StandardScaler() if scaler is None else scaler

        # May be None when no model has been trained and saved yet.
        self.clusterer = self._load_pickle(self._clusterer_path)

    @staticmethod
    def _load_pickle(path):
        """Return the object pickled at ``path``, or ``None`` if it is absent."""
        try:
            # The context manager guarantees the handle is closed even when
            # unpickling fails.
            with open(path, 'rb') as handle:
                return pickle.load(handle)
        except FileNotFoundError:
            return None

    def train_cluster(self, train_df, min_cluster_size=100, min_samples=1):
        """Fit the scaler and a new HDBSCAN model on ``train_df``.

        ``self.scaler`` is refitted in place.  ``self.clusterer`` is NOT
        replaced; the new model is only returned, so persisting or installing
        it is left to the caller.

        Args:
            train_df: Training features, passed straight to
                ``scaler.fit_transform`` (presumably a numeric 2-D
                array-like or DataFrame -- confirm against the caller).
            min_cluster_size: HDBSCAN ``min_cluster_size``; defaults to the
                previously hard-coded value of 100.
            min_samples: HDBSCAN ``min_samples``; defaults to the previously
                hard-coded value of 1.

        Returns:
            A ``(model, scaler)`` tuple: the fitted HDBSCAN model and the
            fitted scaler.
        """
        scaled = self.scaler.fit_transform(train_df)
        model = HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
            metric='euclidean',
            cluster_selection_method='eom',
            gen_min_span_tree=True,
            # Required so that approximate_predict() works on this model.
            prediction_data=True,
        ).fit(scaled)
        return model, self.scaler

    def test_cluster(self, test_df):
        """Assign cluster labels to new samples using the loaded clusterer.

        Args:
            test_df: Features to label, passed to ``scaler.transform``
                (presumably the same columns used for training -- confirm
                against the caller).

        Returns:
            The labels returned by ``hdbscan.approximate_predict``; the
            membership strengths it also returns are discarded.

        Raises:
            FileNotFoundError: If no clusterer model was found when this
                object was created.
        """
        if self.clusterer is None:
            raise FileNotFoundError(
                "No clusterer model found at "
                f"{getattr(self, '_clusterer_path', self._CLUSTERER_FILE)!r}; "
                "train and save a model first."
            )
        scaled = self.scaler.transform(test_df)
        test_labels, _ = hdbscan.approximate_predict(self.clusterer, scaled)
        return test_labels