# Description: Creates clusters based on the uploaded transcripts and returns the
# uuids of the documents that are similar to the query.
# Reference Code: https://github.com/chakib401/smoothing_sentence_embeddings/blob/master/utils.py

'''
Paper citation for normalize_adj():
Fettal, Chakib, Lazhar Labiod, and Mohamed Nadif.
"More Discriminative Sentence Embeddings via Semantic Graph Smoothing."
arXiv preprint arXiv:2402.12890 (2024).
'''
import json
import os

import joblib
import numpy as np
import pandas as pd
import scipy.sparse as sp
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from scipy.io import loadmat, savemat
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.neighbors import kneighbors_graph

import src.clustering.resonate_semantic_search as SemanticSearch


def normalize_adj(adj, lmbda=1):
    '''
    Normalize the adjacency matrix of the semantic graph: add a weighted
    self-loop, row-normalize, then apply symmetric D^-1/2 A D^-1/2 scaling.
    '''
    adj = adj + lmbda * sp.eye(adj.shape[0])
    rowsum = np.array(adj.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.0
    r_mat_inv = sp.diags(r_inv)
    adj = r_mat_inv.dot(adj)
    adj = adj + lmbda * sp.eye(adj.shape[0])
    adj = sp.coo_matrix(adj)
    row_sum = np.array(adj.sum(1))
    d_inv_sqrt = np.power(row_sum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.0
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return d_mat_inv_sqrt.dot(adj).dot(d_mat_inv_sqrt).tocoo()
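
# Illustrative sketch (not executed): how the normalization above is applied to
# a small kNN graph, mirroring the call made in graph_filtering() below. The
# feature matrix and the neighbor count here are made-up values.
#
#   toy_features = np.random.rand(10, 64)
#   toy_adj = kneighbors_graph(toy_features, n_neighbors=3, metric="cosine")
#   toy_adj = (toy_adj + toy_adj.T) / 2    # symmetrize, as graph_filtering does
#   S = normalize_adj(toy_adj)             # sparse (10, 10) coo_matrix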


def graph_filtering(features, degree=2, lmbda=1, nn=10, alpha=0.5, t=5, method="sgc"):
    """
    Perform graph filtering on the feature matrix using one of four polynomial
    filters: "sgc", "s2gc", "appnp", or "dgc".

    We keep nn=10, as per the paper; it is the number of neighbors used to build
    the kNN graph (adjacency matrix) over the feature vectors.
    **That is why we have 10 pre-existing transcripts placed in Pinecone (through the ont_time_script).
    **If you want to change the number of transcripts, you will have to change the number of neighbors.
    """
    adj = kneighbors_graph(features, n_neighbors=nn, metric="cosine")
    adj = (adj + adj.T) / 2
    S = normalize_adj(adj, lmbda)
    xx = features
    yy = features.copy()
    if method in ["sgc", "s2gc"]:
        for _ in range(degree):
            xx = S @ xx
            yy += xx
        if method == "sgc":
            return xx
        elif method == "s2gc":
            return yy
    elif method == "appnp":
        for _ in range(degree):
            xx = (1 - alpha) * S @ xx + alpha * features
        return xx
    elif method == "dgc":
        k = degree + 1
        for _ in range(1, degree + 1):
            xx = (1 - t / k) * xx + (t / k) * (S @ xx)
        return xx
    else:
        raise ValueError("unrecognized filter")
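
# Illustrative sketch (not executed): smoothing a feature matrix with two of the
# supported filters. The random features are made up for the example; in this
# module the real features come from cluster-embedding.mat.
#
#   feats = np.random.rand(12, 64)
#   smoothed_sgc = graph_filtering(feats, method="sgc")   # S^degree @ X
#   smoothed_dgc = graph_filtering(feats, method="dgc")   # decoupled graph convolution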


def load_json_config(json_file_path="./config/config.json"):
    with open(json_file_path, "r") as file:
        data = json.load(file)
    return data
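
# The only key this module reads from config.json is EMBEDDING_MODEL_NAME.
# A minimal illustrative config file (the model name shown is an assumption,
# not necessarily the one the project ships with):
#
#   {
#       "EMBEDDING_MODEL_NAME": "text-embedding-ada-002"
#   }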


class Clustering:
    def __init__(self):
        self.api_key = os.environ.get("OPENAI_API_KEY")
        self.method = "dgc"
        # Build the clusters once if the cached cluster file does not exist yet.
        if not os.path.exists("./data/clusteringFiles/cluster_data.csv"):
            self.create_Cluster()
        self.index = self.initialize_FAISS()

    def create_embedding(self):
        '''
        This function performs two tasks:
        1. Embeds the entire dataset, abstract_summary_data.csv
        2. Saves the embeddings in cluster-embedding.mat in the format uuid, text
        '''
        data = pd.read_csv("./data/summaryFiles/abstract_summary_data.csv")
        json_config = load_json_config()
        text, uuids = data["text"], data["uuid"]
        # embedding model
        embed = OpenAIEmbeddings(
            model=json_config["EMBEDDING_MODEL_NAME"], openai_api_key=self.api_key
        )
        embeddings = embed.embed_documents(text)
        savemat(
            "./data/embeddingFiles/cluster-embedding.mat",
            {"uuid": uuids, "text": embeddings},
        )
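
    # Illustrative note (not executed): the .mat file saved above can be read
    # back with scipy.io.loadmat, which is what create_Cluster() does.
    #
    #   mat = loadmat("./data/embeddingFiles/cluster-embedding.mat")
    #   mat["uuid"]   # document uuids
    #   mat["text"]   # 2-D array of embedding vectors, one row per document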

    def create_Cluster(self):
        '''
        This function performs the following tasks:
        1. Calls the embedding function
        2. Forms clusters using the cluster-embedding.mat file
        3. Saves the predicted labels in cluster_data.csv
        '''
        self.create_embedding()
        data = loadmat("./data/embeddingFiles/cluster-embedding.mat")
        features1 = data["text"]
        features = graph_filtering(features1, method=self.method)
        ibandwidth = estimate_bandwidth(features, quantile=0.30, random_state=42)
        msclustering = MeanShift(bandwidth=ibandwidth, max_iter=900)
        msclustering.fit(features)
        model_path = f"./data/clusteringFiles/{self.method}_model.joblib"
        joblib.dump(msclustering, model_path)
        print("Model saved")
        df = pd.read_csv("./data/summaryFiles/abstract_summary_data.csv")
        df["cluster"] = msclustering.predict(features)
        df.to_csv("./data/clusteringFiles/cluster_data.csv")
        print("Cluster data saved")
        self.index = self.initialize_FAISS()
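
    # Illustrative sketch (not executed): the fitted MeanShift model saved above
    # can be reloaded later to assign new, already graph-filtered embeddings to
    # the existing clusters. Variable names here are hypothetical.
    #
    #   ms = joblib.load("./data/clusteringFiles/dgc_model.joblib")
    #   new_labels = ms.predict(new_filtered_features)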

    def uuid_for_query(self, query):
        '''
        Returns the uuids of the documents that are similar to the query, based on the clustering
        '''
        query_cluster_label = self.index.search_query(query)
        print(f"Predicted Label : {query_cluster_label[0]}")
        df = pd.read_csv("./data/clusteringFiles/cluster_data.csv")
        filtered_uuids = df[df["cluster"] == query_cluster_label[0]]["uuid"].tolist()
        return filtered_uuids

    def initialize_FAISS(self):
        '''
        Builds the FAISS semantic-search index over the clustered summaries.
        '''
        model = SemanticSearch.SemanticEmbedding()
        index = SemanticSearch.FaissForQuerySearch(model)
        data = pd.read_csv("./data/clusteringFiles/cluster_data.csv")
        features1 = data["text"]
        uuids = data["uuid"]
        labels = data["cluster"]
        for text, uuid, label in zip(features1, uuids, labels):
            index.add_summary(text, uuid, label)
        return index


if __name__ == "__main__":
    # load_dotenv("./config/.env")
    Clustering_obj = Clustering()
    print(
        Clustering_obj.uuid_for_query(
            "What is the goal of defining maintainability for the new diffs architecture?"
        )
    )
    print(
        Clustering_obj.uuid_for_query(
            "What was the design component for remote control?"
        )
    )