Spaces:

GIZ
/

Development-Project-Synergy-Finder

Sleeping

App Files Files Community

Jan Mühlnikel committed on Apr 13, 2024

Commit

f17e764

1 Parent(s): ac6359f

sparse matrix changes

Browse files

Files changed (4) hide show

functions/calc_matches.py +41 -0
functions/single_similar.py +29 -0
similarity_page.py +15 -0
src/extended_similarities.npz +2 -2

functions/calc_matches.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import pandas as pd
 import numpy as np
 def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
     # matching project2 can be any project
     # indices (rows) = project1
@@ -31,6 +33,45 @@ def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
     p2_df["similarity"] = top_values
     return p1_df, p2_df

 import pandas as pd
 import numpy as np
+from scipy.sparse import csr_matrix, lil_matrix
+"""
 def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
     # matching project2 can be any project
     # indices (rows) = project1
     p2_df["similarity"] = top_values
     return p1_df, p2_df
+"""
def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
    """Return the ``top_x`` most similar (project1, project2) pairs.

    Rows of the match matrix are taken from ``filtered_df`` (project1) and
    columns from ``project_df`` (project2). The index labels of both
    dataframes are used directly as row/column positions in
    ``similarity_matrix``.

    Only entries that are stored and non-zero in the sparse matrix are
    candidates, so fewer than ``top_x`` pairs are returned when there are
    not enough non-zero similarities.

    Args:
        filtered_df: DataFrame of candidate "project1" rows.
        project_df: DataFrame of candidate "project2" rows.
        similarity_matrix: Pairwise similarity matrix; converted to CSR if
            it is not one already. The caller's matrix is never modified.
        top_x: Maximum number of pairs to return. Values <= 0 yield empty
            results.

    Returns:
        A tuple ``(p1_df, p2_df)`` of equally long DataFrames. Row ``k`` of
        ``p1_df`` and row ``k`` of ``p2_df`` form one matched pair, and both
        carry the pair's score in a ``similarity`` column. Pairs are ordered
        by ascending similarity (best match last).
    """
    # CSR supports the row/column fancy indexing used below.
    if not isinstance(similarity_matrix, csr_matrix):
        similarity_matrix = csr_matrix(similarity_matrix)

    filtered_df_indices = filtered_df.index.to_list()
    project_df_indices = project_df.index.to_list()

    # COO exposes aligned (row, col, value) triplets, so a position in
    # ``values`` maps directly to a 2D coordinate of the submatrix. Indexing
    # ``.data`` of a CSR matrix and unravelling it as a dense flat index
    # would pair the values with the wrong projects.
    match_matrix = similarity_matrix[filtered_df_indices, :][:, project_df_indices].tocoo()
    rows = match_matrix.row
    cols = match_matrix.col
    values = match_matrix.data

    # Explicitly stored zeros carry no similarity information.
    keep = values != 0

    # When both dataframes cover the same projects, drop self-matches (the
    # diagonal of the full matrix) without copying the whole matrix.
    if np.array_equal(filtered_df_indices, project_df_indices):
        labels = np.asarray(filtered_df_indices)
        keep &= labels[rows] != labels[cols]

    rows, cols, values = rows[keep], cols[keep], values[keep]

    # Clamp so that top_x == 0 selects nothing (``[-0:]`` would select all).
    top_x = max(0, min(int(top_x), values.size))
    order = np.argsort(values)[values.size - top_x:]

    top_values = values[order]

    # Build the result frames on copies so the inputs stay untouched.
    p1_df = filtered_df.iloc[rows[order]].copy()
    p1_df['similarity'] = top_values
    p2_df = project_df.iloc[cols[order]].copy()
    p2_df['similarity'] = top_values

    return p1_df, p2_df

functions/single_similar.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import pandas as pd
 import numpy as np
 def find_similar(p_index, similarity_matrix, filtered_df, top_x):
     # filter out just projects from filtered df
@@ -21,5 +23,32 @@ def find_similar(p_index, similarity_matrix, filtered_df, top_x):
     result_df["similarity"] = top_10_values_descending
     return result_df

 import pandas as pd
 import numpy as np
+from scipy.sparse import csr_matrix
+"""
 def find_similar(p_index, similarity_matrix, filtered_df, top_x):
     # filter out just projects from filtered df
     result_df["similarity"] = top_10_values_descending
     return result_df
+"""
def find_similar(p_index, similarity_matrix, filtered_df, top_x):
    """Return the ``top_x`` rows of ``filtered_df`` most similar to one project.

    The index labels of ``filtered_df`` are used directly as column
    positions in ``similarity_matrix``; ``p_index`` selects the row of the
    reference project.

    Args:
        p_index: Row position of the reference project in
            ``similarity_matrix``.
        similarity_matrix: Pairwise similarity matrix; converted to CSR if
            it is not one already.
        filtered_df: DataFrame holding the candidate projects.
        top_x: Maximum number of rows to return. Values <= 0 yield an empty
            result.

    Returns:
        A copy of the selected ``filtered_df`` rows, ordered by descending
        similarity, with the score stored in a ``similarity`` column.
    """
    # CSR gives cheap row extraction.
    if not isinstance(similarity_matrix, csr_matrix):
        similarity_matrix = csr_matrix(similarity_matrix)

    filtered_indices = filtered_df.index.tolist()

    # Take the single project row first and only then restrict it to the
    # filtered columns; this avoids slicing columns across every row.
    # The row is densified because argsort needs the zero entries as well.
    project_row = (
        similarity_matrix.getrow(p_index)[:, filtered_indices].toarray().ravel()
    )

    # Clamp so that top_x == 0 selects nothing (``[-0:]`` would select all).
    top_x = max(0, min(int(top_x), project_row.size))
    top_positions = np.argsort(project_row)[project_row.size - top_x:][::-1]

    # Positions in ``project_row`` line up with row positions in
    # ``filtered_df``, so positional selection works even when index labels
    # repeat. Copy before adding a column so the input is never touched.
    result_df = filtered_df.iloc[top_positions].copy()
    result_df['similarity'] = project_row[top_positions]

    return result_df

similarity_page.py CHANGED Viewed

@@ -28,21 +28,36 @@ def get_process_memory():
 """
 # Catch DATA
 # Load Similarity matrix
 @st.cache_data
 def load_sim_matrix():
     loaded_matrix = load_npz("src/extended_similarities.npz")
     dense_matrix = loaded_matrix.toarray().astype('float16')
     return dense_matrix
 # Load Non Similar Orga Matrix
 @st.cache_data
 def load_nonsameorga_sim_matrix():
     loaded_matrix = load_npz("src/extended_similarities_nonsimorga.npz")
     dense_matrix = loaded_matrix.toarray().astype('float16')
     return dense_matrix
 # Load Projects DFs
 @st.cache_data

 """
 # Catch DATA
 # Load Similarity matrix
+"""
 @st.cache_data
 def load_sim_matrix():
     loaded_matrix = load_npz("src/extended_similarities.npz")
     dense_matrix = loaded_matrix.toarray().astype('float16')
     return dense_matrix
+"""
@st.cache_data
def load_sim_matrix():
    """Load the project similarity matrix from ``src/extended_similarities.npz``.

    The object returned by ``load_npz`` is handed back unchanged (it is not
    converted to a dense array). The call is wrapped in ``st.cache_data``.
    """
    return load_npz("src/extended_similarities.npz")
 # Load Non Similar Orga Matrix
+"""
 @st.cache_data
 def load_nonsameorga_sim_matrix():
     loaded_matrix = load_npz("src/extended_similarities_nonsimorga.npz")
     dense_matrix = loaded_matrix.toarray().astype('float16')
     return dense_matrix
+"""
# Cached like load_sim_matrix; the decorator was present on the earlier
# version of this loader and is needed to avoid reloading the file on each call.
@st.cache_data
def load_nonsameorga_sim_matrix():
    """Load the similarity matrix from ``src/extended_similarities_nonsimorga.npz``.

    The object returned by ``load_npz`` is handed back unchanged (it is not
    converted to a dense array).
    """
    return load_npz("src/extended_similarities_nonsimorga.npz")
 # Load Projects DFs
 @st.cache_data

src/extended_similarities.npz CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d4de37759fc87a54415ca39e71b17a9a3b507091f3af401bf70d24bbf1a22aa9
-size 6888936

 version https://git-lfs.github.com/spec/v1
+oid sha256:2c8747d1e71428e191cab9b6bb7187a7ede099e83f722cd8dabd133b3e994ac4
+size 2779951