# Description: Creates clusters based on the uploaded transcripts and returns
# the UUIDs of the documents that are similar to the query.
# Reference Code: https://github.com/chakib401/smoothing_sentence_embeddings/blob/master/utils.py

'''
Paper citation for normalize_adj():

Fettal, Chakib, Lazhar Labiod, and Mohamed Nadif.
"More Discriminative Sentence Embeddings via Semantic Graph Smoothing."
arXiv preprint arXiv:2402.12890 (2024).
'''

import json
import os

import joblib
import numpy as np
import pandas as pd
import scipy.sparse as sp
import src.clustering.resonate_semantic_search as SemanticSearch
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from scipy.io import loadmat, savemat
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.neighbors import kneighbors_graph


def normalize_adj(adj, lmbda=1):
    '''
    Normalize the adjacency matrix of the semantic graph.

    Two passes are applied: first a random-walk (row) normalization D^-1 A
    with self-loops, then a symmetric normalization D^-1/2 A D^-1/2, again
    with self-loops.
    '''
    # Pass 1: add self-loops and row-normalize (random-walk normalization)
    adj = adj + lmbda * sp.eye(adj.shape[0])
    rowsum = np.array(adj.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.0
    r_mat_inv = sp.diags(r_inv)
    adj = r_mat_inv.dot(adj)

    # Pass 2: add self-loops again and symmetrically normalize
    adj = adj + lmbda * sp.eye(adj.shape[0])
    adj = sp.coo_matrix(adj)
    row_sum = np.array(adj.sum(1))
    d_inv_sqrt = np.power(row_sum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.0
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return d_mat_inv_sqrt.dot(adj).dot(d_mat_inv_sqrt).tocoo()


def graph_filtering(features, degree=2, lmbda=1, nn=10, alpha=0.5, t=5, method="sgc"):
    """
    Smooth the feature matrix with one of four polynomial graph filters
    (sgc, s2gc, appnp, dgc) applied over a cosine k-NN graph.

    We keep nn=10, as per the paper; it is the number of neighbors used to
    build the adjacency graph between the transcript embedding vectors.
    **That is why 10 pre-existing transcripts are placed in Pinecone
    (through the one_time_script).
    **If you change the number of transcripts, you will also have to change
    the number of neighbors.
    """
    adj = kneighbors_graph(features, n_neighbors=nn, metric="cosine")
    adj = (adj + adj.T) / 2  # symmetrize the k-NN graph
    S = normalize_adj(adj, lmbda)
    xx = features
    yy = features.copy()
    if method in ["sgc", "s2gc"]:
        for _ in range(degree):
            xx = S @ xx
            yy += xx
        if method == "sgc":
            return xx
        elif method == "s2gc":
            return yy
    elif method == "appnp":
        for _ in range(degree):
            xx = (1 - alpha) * S @ xx + alpha * features
        return xx
    elif method == "dgc":
        k = degree + 1
        for _ in range(1, degree + 1):
            xx = (1 - t / k) * xx + (t / k) * (S @ xx)
        return xx
    else:
        # raising a bare string is a TypeError in Python 3; raise a proper exception
        raise ValueError(f"unrecognized filter: {method!r}")


def load_json_config(json_file_path="./config/config.json"):
    with open(json_file_path, "r") as file:
        data = json.load(file)
    return data
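
# A minimal, self-contained sketch (illustration only, not part of the
# pipeline): it runs graph_filtering() on synthetic embeddings to show that
# every filter returns a smoothed matrix with the same shape as its input.
# The sizes (50 documents, 8 dimensions) are arbitrary choices for this demo.
def _demo_graph_filtering():
    rng = np.random.default_rng(0)
    feats = rng.normal(size=(50, 8))  # 50 fake "documents", 8-dim embeddings
    for m in ("sgc", "s2gc", "appnp", "dgc"):
        smoothed = graph_filtering(feats, degree=2, nn=10, method=m)
        print(m, np.asarray(smoothed).shape)  # -> (50, 8) for every filter
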
class Clustering:
    def __init__(self):
        self.api_key = os.environ.get("OPENAI_API_KEY")
        self.method = "dgc"
        if not os.path.exists("./data/clusteringFiles/cluster_data.csv"):
            self.create_Cluster()
        self.index = self.initialize_FAISS()

    def create_embedding(self):
        '''
        This function performs two tasks:
        1. embed the entire dataset, abstract_summary_data.csv
        2. save the embeddings in cluster-embedding.mat under the keys
           uuid and text
        '''
        data = pd.read_csv("./data/summaryFiles/abstract_summary_data.csv")
        json_config = load_json_config()
        text, uuids = data["text"], data["uuid"]

        # embedding model
        embed = OpenAIEmbeddings(
            model=json_config["EMBEDDING_MODEL_NAME"], openai_api_key=self.api_key
        )
        embeddings = embed.embed_documents(text)
        savemat(
            "./data/embeddingFiles/cluster-embedding.mat",
            {"uuid": uuids, "text": embeddings},
        )

    def create_Cluster(self):
        '''
        This function performs the following tasks:
        1. call the embedding function
        2. form clusters using the cluster-embedding.mat file
        3. save the predicted labels in cluster_data.csv
        '''
        self.create_embedding()
        data = loadmat("./data/embeddingFiles/cluster-embedding.mat")
        features1 = data["text"]
        features = graph_filtering(features1, method=self.method)
        ibandwidth = estimate_bandwidth(features, quantile=0.30, random_state=42)
        msclustering = MeanShift(bandwidth=ibandwidth, max_iter=900)
        msclustering.fit(features)

        model_path = f"./data/clusteringFiles/{self.method}_model.joblib"
        joblib.dump(msclustering, model_path)
        print("Model saved")

        df = pd.read_csv("./data/summaryFiles/abstract_summary_data.csv")
        df["cluster"] = msclustering.predict(features)
        # index=False keeps the pandas index from being written as an extra column
        df.to_csv("./data/clusteringFiles/cluster_data.csv", index=False)
        print("Cluster data saved")
        self.index = self.initialize_FAISS()

    def uuid_for_query(self, query):
        '''
        Return the UUIDs of the documents that are similar to the query,
        based on the clustering.
        '''
        query_cluster_label = self.index.search_query(query)
        print(f"Predicted Label : {query_cluster_label[0]}")
        df = pd.read_csv("./data/clusteringFiles/cluster_data.csv")
        filtered_uuids = df[df["cluster"] == query_cluster_label[0]]["uuid"].tolist()
        return filtered_uuids

    def initialize_FAISS(self):
        model = SemanticSearch.SemanticEmbedding()
        index = SemanticSearch.FaissForQuerySearch(model)
        data = pd.read_csv("./data/clusteringFiles/cluster_data.csv")
        texts = data["text"]
        uuids = data["uuid"]
        labels = data["cluster"]
        for text, uuid, label in zip(texts, uuids, labels):
            index.add_summary(text, uuid, label)
        return index


if __name__ == "__main__":
    load_dotenv("./config/.env")
    Clustering_obj = Clustering()
    print(
        Clustering_obj.uuid_for_query(
            "What is the goal of defining maintainability for the new diffs architecture?"
        )
    )
    print(
        Clustering_obj.uuid_for_query(
            "What was the design component for remote control?"
        )
    )
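
# --- Assumed interface of resonate_semantic_search (sketch, not verbatim) ---
# The module imported as SemanticSearch above is not shown in this file. From
# the calls made here, it must expose a SemanticEmbedding model plus a
# FaissForQuerySearch index with add_summary(text, uuid, label) and
# search_query(query) returning the predicted cluster label(s). A minimal
# FAISS-backed version consistent with that usage could look like the sketch
# below; the sentence-transformers model name, the encode() method, and the
# embedding size are assumptions, not the project's actual code:
#
#   import faiss
#   import numpy as np
#   from sentence_transformers import SentenceTransformer
#
#   class SemanticEmbedding:
#       def __init__(self, name="all-MiniLM-L6-v2"):  # hypothetical default
#           self.model = SentenceTransformer(name)
#
#       def encode(self, text):
#           # FAISS expects float32 row vectors
#           return np.asarray(self.model.encode([text]), dtype="float32")
#
#   class FaissForQuerySearch:
#       def __init__(self, model, dim=384):  # 384 = MiniLM output size
#           self.model = model
#           self.index = faiss.IndexFlatL2(dim)
#           self.labels, self.uuids = [], []
#
#       def add_summary(self, text, uuid, label):
#           self.index.add(self.model.encode(text))
#           self.labels.append(label)
#           self.uuids.append(uuid)
#
#       def search_query(self, query, k=1):
#           # return the cluster label(s) of the k nearest stored summaries
#           _, idx = self.index.search(self.model.encode(query), k)
#           return [self.labels[i] for i in idx[0]]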