from typing import List, Optional

import numpy as np
from sklearn.preprocessing import StandardScaler

from .config import BaselineConfig, Configuration
from .clusters import ClusterList
from ..utils import __create_model__
from unsupervised_learning.clustering import GaussianMixture, Silhouette


class ClusterPipeline:
    """Four-stage document-clustering pipeline.

    1. sentence embeddings via a pre-trained language model (``PTM.encode``)
    2. dimension reduction
    3. clustering, with the best K chosen by a silhouette search over
       Gaussian mixtures in ``[2, max_k]``
    4. per-cluster keyword extraction

    Each stage's model is built by ``__create_model__`` from the given
    ``Configuration``; a stage whose model is ``None`` is skipped and its
    input is passed through unchanged.
    """

    def __init__(self, config: Optional[Configuration] = None):
        """Build the pipeline from *config*, defaulting to ``BaselineConfig()``."""
        self.__setup__(config if config is not None else BaselineConfig())

    def __setup__(self, config: Configuration):
        """Instantiate the four stage models declared by *config*."""
        self.PTM = __create_model__(config.plm)
        self.dimension_reduction = __create_model__(config.dimension_reduction)
        self.clustering = __create_model__(config.clustering)
        self.keywords_extraction = __create_model__(config.keywords_extraction)

    def __1_generate_word_embeddings__(self, documents: List[str]):
        '''
        :param documents: a list of N strings
        :return: np.ndarray: Nx384 (sentence-transformers)
        '''
        print('>>> start generating word embeddings...')
        # Encode BEFORE announcing success: the original printed the success
        # message before the embeddings had actually been produced.
        embeddings = self.PTM.encode(documents)
        print('>>> successfully generated word embeddings...')
        return embeddings

    def __2_dimenstion_reduction__(self, embeddings):
        '''
        :param embeddings: NxD
        :return: Nxd with d << D, or *embeddings* unchanged when no
                 dimension-reduction model is configured
        '''
        # NOTE(review): None-guard added for consistency with steps 3 and 4,
        # which skip themselves when their model is None.
        if self.dimension_reduction is None:
            return embeddings
        print('>>> start dimension reduction...')
        embeddings = self.dimension_reduction.dimension_reduction(embeddings)
        print('>>> finished dimension reduction...')
        return embeddings

    def __3_clustering__(self, embeddings, return_cluster_centers=False,
                         max_k: int = 10, standarization=False):
        '''
        :param embeddings: Nxd embedding matrix
        :param return_cluster_centers: also return the fitted cluster centers
        :param max_k: upper bound for the silhouette search over K
        :param standarization: z-score the embeddings before clustering
        :return: ClusterList (plus cluster centers when requested), or the raw
                 embeddings when no clustering model is configured
        '''
        if self.clustering is None:
            return embeddings
        print('>>> start clustering...')
        if standarization:
            print('>>> start standardization...')
            embeddings = StandardScaler().fit_transform(embeddings)
            print('>>> finished standardization...')
        # Pick the best K in [2, max_k] by silhouette score over Gaussian
        # mixtures, then fit the configured clustering model with that K.
        best_k = Silhouette(GaussianMixture, 2, max_k).get_best_k(embeddings)
        print(f'>>> The best K is {best_k}.')
        labels, cluster_centers = self.clustering(embeddings, k=best_k)
        clusters = ClusterList(best_k)
        clusters.instantiate(labels)
        print('>>> finished clustering...')
        if return_cluster_centers:
            return clusters, cluster_centers
        return clusters

    def __4_keywords_extraction__(self, clusters: ClusterList, documents: List[str]):
        '''
        :param clusters: clusters over the N documents
        :param documents: the N source documents, indexed by the ids stored
                          in each cluster
        :return: clusters, where each cluster has added keyphrases
        '''
        if self.keywords_extraction is None:
            return clusters
        print('>>> start keywords extraction')
        for cluster in clusters:
            doc_ids = cluster.elements()
            # Extract keyphrases from all of a cluster's documents in one
            # batched call, then attach them to the cluster.
            input_abstracts = [documents[i] for i in doc_ids]  # [str]
            keyphrases = self.keywords_extraction(input_abstracts)  # [{keys...}]
            cluster.add_keyphrase(keyphrases)
        print('>>> finished keywords extraction')
        return clusters

    def __call__(self, documents: List[str], max_k: int, standarization=False):
        """Run the full embed → reduce → cluster → keywords pipeline."""
        print('>>> pipeline starts...')
        x = self.__1_generate_word_embeddings__(documents)
        x = self.__2_dimenstion_reduction__(x)
        clusters = self.__3_clustering__(x, max_k=max_k, standarization=standarization)
        outputs = self.__4_keywords_extraction__(clusters, documents)
        print('>>> pipeline finished!\n')
        return outputs