resonate/src/clustering/resonate_clustering.py
# Description: Creates clusters from the uploaded transcripts and returns the uuids of the documents that are similar to the query.
# Reference Code: https://github.com/chakib401/smoothing_sentence_embeddings/blob/master/utils.py
'''
Paper citation for normalize_adj():
Fettal, Chakib, Lazhar Labiod, and Mohamed Nadif.
"More Discriminative Sentence Embeddings via Semantic Graph Smoothing."
arXiv preprint arXiv:2402.12890 (2024).
'''
import json
import os
import joblib
import numpy as np
import pandas as pd
import scipy.sparse as sp
import src.clustering.resonate_semantic_search as SemanticSearch
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from scipy.io import loadmat, savemat
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.neighbors import kneighbors_graph
def normalize_adj(adj, lmbda=1):
    '''
    Normalize the adjacency matrix of the semantic graph
    '''
    # Add self-loops weighted by lmbda, then row-normalize (random-walk style).
    adj = adj + lmbda * sp.eye(adj.shape[0])
    rowsum = np.array(adj.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.0
    r_mat_inv = sp.diags(r_inv)
    adj = r_mat_inv.dot(adj)
    # Add self-loops again and apply symmetric normalization D^-1/2 A D^-1/2.
    adj = adj + lmbda * sp.eye(adj.shape[0])
    adj = sp.coo_matrix(adj)
    row_sum = np.array(adj.sum(1))
    d_inv_sqrt = np.power(row_sum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.0
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return d_mat_inv_sqrt.dot(adj).dot(d_mat_inv_sqrt).tocoo()
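# A minimal sketch (not part of the original module): normalize_adj on a tiny
# hand-made adjacency matrix, just to show that it returns a sparse COO matrix
# of the same shape. The 4x4 matrix below is illustrative only.
def _normalize_adj_example():
    toy = sp.csr_matrix(
        np.array(
            [[0, 1, 0, 1],
             [1, 0, 1, 0],
             [0, 1, 0, 1],
             [1, 0, 1, 0]],
            dtype=float,
        )
    )
    normalized = normalize_adj(toy, lmbda=1)
    assert normalized.shape == toy.shape and sp.isspmatrix_coo(normalized)
    return normalized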
def graph_filtering(features, degree=2, lmbda=1, nn=10, alpha=0.5, t=5, method="sgc"):
    """
    Perform graph filtering with one of four polynomial filters
    (sgc, s2gc, appnp, dgc).
    We keep nn=10, as in the paper; it is the number of neighbors used to build
    the k-nearest-neighbor graph (adjacency matrix) over the feature vectors.
    **That is why 10 pre-existing transcripts are placed in Pinecone (through
    the one_time_script).
    **If you change the number of transcripts, you will also have to change the
    number of neighbors.
    """
    # Build a symmetric kNN graph over the features and normalize it.
    adj = kneighbors_graph(features, n_neighbors=nn, metric="cosine")
    adj = (adj + adj.T) / 2
    S = normalize_adj(adj, lmbda)
xx = features
yy = features.copy()
    if method in ["sgc", "s2gc"]:
        # SGC keeps the repeatedly propagated features; S2GC accumulates them.
        for _ in range(degree):
xx = S @ xx
yy += xx
if method == "sgc":
return xx
elif method == "s2gc":
return yy
elif method == "appnp":
for _ in range(degree):
xx = (1 - alpha) * S @ xx + alpha * features
return xx
elif method == "dgc":
k = degree + 1
for _ in range(1, degree + 1):
xx = (1 - t / k) * xx + (t / k) * (S @ xx)
return xx
    else:
        raise ValueError("unrecognized filter")
def load_json_config(json_file_path="./config/config.json"):
with open(json_file_path, "r") as file:
data = json.load(file)
return data
class Clustering:
def __init__(self):
self.api_key = os.environ.get("OPENAI_API_KEY")
self.method = "dgc"
        # Build the clusters once if the cached cluster data does not exist yet.
        if not os.path.exists("./data/clusteringFiles/cluster_data.csv"):
self.create_Cluster()
self.index = self.initialize_FAISS()
    def create_embedding(self):
        '''This function performs two tasks:
        1. Embed the entire abstract_summary_data.csv dataset.
        2. Save the embeddings to cluster-embedding.mat under the keys uuid and text.
        '''
data = pd.read_csv("./data/summaryFiles/abstract_summary_data.csv")
json_config = load_json_config()
        text, uuids = data["text"], data["uuid"]
# embedding model
embed = OpenAIEmbeddings(
model=json_config["EMBEDDING_MODEL_NAME"], openai_api_key=self.api_key
)
embeddings = embed.embed_documents(text)
savemat(
"./data/embeddingFiles/cluster-embedding.mat",
{"uuid": id, "text": embeddings},
)
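    # Illustrative helper (not part of the original pipeline): read back the
    # .mat file written by create_embedding; 'uuid' holds the document ids and
    # 'text' holds the embedding matrix.
    def _load_saved_embeddings(self, path="./data/embeddingFiles/cluster-embedding.mat"):
        mat = loadmat(path)
        return mat["uuid"], mat["text"]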
    def create_Cluster(self):
        '''
        This function performs the following tasks:
        1. Call the embedding function.
        2. Form clusters from the cluster-embedding.mat file.
        3. Save the predicted cluster labels in cluster_data.csv.
        '''
self.create_embedding()
data = loadmat("./data/embeddingFiles/cluster-embedding.mat")
features1 = data["text"]
        # Smooth the embeddings with the selected graph filter before clustering.
        features = graph_filtering(features1, method=self.method)
ibandwidth = estimate_bandwidth(features, quantile=0.30, random_state=42)
msclustering = MeanShift(bandwidth=ibandwidth, max_iter=900)
msclustering.fit(features)
model_path = f"./data/clusteringFiles/{self.method}_model.joblib"
joblib.dump(msclustering, model_path)
print("Model saved")
        df = pd.read_csv("./data/summaryFiles/abstract_summary_data.csv")
df["cluster"] = msclustering.predict(features)
        df.to_csv("./data/clusteringFiles/cluster_data.csv", index=False)
print("Cluster data saved")
self.index = self.initialize_FAISS()
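    # Illustrative helper (not part of the original pipeline): the clustering
    # step of create_Cluster on made-up data, with the same quantile and
    # max_iter settings, just to show the MeanShift call in isolation.
    def _meanshift_example(self):
        rng = np.random.default_rng(0)
        toy_features = rng.normal(size=(20, 4))
        bandwidth = estimate_bandwidth(toy_features, quantile=0.30, random_state=42)
        labels = MeanShift(bandwidth=bandwidth, max_iter=900).fit_predict(toy_features)
        return labels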
def uuid_for_query(self, query):
'''
Returns the uuids of the documents that are similar to the query, based on the clustering
'''
        # Predict the query's cluster label and return every uuid in that cluster.
        query_cluster_label = self.index.search_query(query)
print(f"Predicted Label : {query_cluster_label[0]}")
df = pd.read_csv("./data/clusteringFiles/cluster_data.csv")
filtered_uuids = df[df["cluster"] == query_cluster_label[0]]["uuid"].tolist()
return filtered_uuids
    def initialize_FAISS(self):
        # Build the FAISS index over the clustered summaries (text, uuid, cluster label).
        model = SemanticSearch.SemanticEmbedding()
index = SemanticSearch.FaissForQuerySearch(model)
data = pd.read_csv("./data/clusteringFiles/cluster_data.csv")
features1 = data["text"]
uuids = data["uuid"]
labels = data["cluster"]
for text, uuid, label in zip(features1, uuids, labels):
index.add_summary(text, uuid, label)
return index
if __name__ == "__main__":
    # load_dotenv("./config/.env")
Clustering_obj = Clustering()
print(
Clustering_obj.uuid_for_query(
"What is the goal of defining maintainability for the new diffs architecture?"
)
)
print(
Clustering_obj.uuid_for_query(
"What was the design component for remote control?"
)
)