Jan Mühlnikel
committed on
Commit
·
f17e764
1
Parent(s):
ac6359f
sparse matrix changes
Browse files- functions/calc_matches.py +41 -0
- functions/single_similar.py +29 -0
- similarity_page.py +15 -0
- src/extended_similarities.npz +2 -2
functions/calc_matches.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1 |
import pandas as pd
|
2 |
import numpy as np
|
|
|
3 |
|
|
|
4 |
def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
|
5 |
# matching project2 can be any project
|
6 |
# indices (rows) = project1
|
@@ -31,6 +33,45 @@ def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
|
|
31 |
p2_df["similarity"] = top_values
|
32 |
|
33 |
return p1_df, p2_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
|
36 |
|
|
|
1 |
import pandas as pd
|
2 |
import numpy as np
|
3 |
+
from scipy.sparse import csr_matrix, lil_matrix
|
4 |
|
5 |
+
"""
|
6 |
def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
|
7 |
# matching project2 can be any project
|
8 |
# indices (rows) = project1
|
|
|
33 |
p2_df["similarity"] = top_values
|
34 |
|
35 |
return p1_df, p2_df
|
36 |
+
"""
|
37 |
+
|
38 |
+
def calc_matches(filtered_df, project_df, similarity_matrix, top_x):
    """Return the ``top_x`` most similar (project1, project2) pairs.

    Rows of the candidate matrix are taken from ``filtered_df`` (project1)
    and columns from ``project_df`` (project2).

    Args:
        filtered_df: DataFrame of project1 candidates. Its index values are
            used as row positions into ``similarity_matrix``
            (assumes index labels equal matrix positions - TODO confirm
            against the caller).
        project_df: DataFrame of project2 candidates. Its index values are
            used as column positions into ``similarity_matrix`` (same
            assumption as above).
        similarity_matrix: Square similarity matrix; anything accepted by
            ``scipy.sparse.csr_matrix`` (it is converted if necessary).
        top_x: Maximum number of pairs to return.

    Returns:
        Tuple ``(p1_df, p2_df)`` of row-aligned copies: row ``i`` of
        ``p1_df`` is matched with row ``i`` of ``p2_df`` and both carry the
        pair's score in a ``similarity`` column. Pairs are ordered by
        ascending similarity (best match last). Fewer than ``top_x`` rows
        are returned when fewer non-zero similarities exist; both frames are
        empty when ``top_x <= 0`` or either input frame is empty.
    """
    # Ensure the matrix is in a format that supports row/column selection
    if not isinstance(similarity_matrix, csr_matrix):
        similarity_matrix = csr_matrix(similarity_matrix)

    # Get indices from dataframes
    filtered_df_indices = filtered_df.index.to_list()
    project_df_indices = project_df.index.to_list()

    if top_x <= 0 or not filtered_df_indices or not project_df_indices:
        # Nothing to match; also avoids ``[-0:]`` selecting *every* entry.
        rows = cols = np.empty(0, dtype=np.intp)
        values = np.empty(0, dtype=similarity_matrix.dtype)
    else:
        # Select the submatrix and switch to COO: COO stores explicit
        # (row, col, value) triplets, so a position in ``data`` maps directly
        # to its 2D coordinates. (Positions in ``data`` are NOT dense flat
        # indices and must not be passed through ``np.unravel_index``.)
        match_matrix = similarity_matrix[filtered_df_indices, :][
            :, project_df_indices
        ].tocoo()

        # Only real (non-zero) similarities are candidates.
        keep = match_matrix.data != 0
        if filtered_df_indices == project_df_indices:
            # Same projects on both axes: the diagonal holds self-matches.
            keep &= match_matrix.row != match_matrix.col

        rows = match_matrix.row[keep]
        cols = match_matrix.col[keep]
        values = match_matrix.data[keep]

        # Ascending sort, keep the last ``top_x`` (all of them if fewer).
        order = np.argsort(values, kind="stable")[-top_x:]
        rows, cols, values = rows[order], cols[order], values[order]

    # Create resulting dataframes with top matches and their similarity scores
    p1_df = filtered_df.iloc[rows].copy()
    p1_df["similarity"] = values
    p2_df = project_df.iloc[cols].copy()
    p2_df["similarity"] = values

    return p1_df, p2_df
|
75 |
|
76 |
|
77 |
|
functions/single_similar.py
CHANGED
@@ -1,6 +1,8 @@
|
|
1 |
import pandas as pd
|
2 |
import numpy as np
|
|
|
3 |
|
|
|
4 |
def find_similar(p_index, similarity_matrix, filtered_df, top_x):
|
5 |
|
6 |
# filter out just projects from filtered df
|
@@ -21,5 +23,32 @@ def find_similar(p_index, similarity_matrix, filtered_df, top_x):
|
|
21 |
result_df["similarity"] = top_10_values_descending
|
22 |
|
23 |
return result_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
|
|
|
|
|
|
|
|
|
|
25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import pandas as pd
|
2 |
import numpy as np
|
3 |
+
from scipy.sparse import csr_matrix
|
4 |
|
5 |
+
"""
|
6 |
def find_similar(p_index, similarity_matrix, filtered_df, top_x):
|
7 |
|
8 |
# filter out just projects from filtered df
|
|
|
23 |
result_df["similarity"] = top_10_values_descending
|
24 |
|
25 |
return result_df
|
26 |
+
"""
|
27 |
+
def find_similar(p_index, similarity_matrix, filtered_df, top_x):
    """Return the ``top_x`` rows of ``filtered_df`` most similar to one project.

    Args:
        p_index: Row position of the reference project in
            ``similarity_matrix``.
        similarity_matrix: Similarity matrix; anything accepted by
            ``scipy.sparse.csr_matrix`` (it is converted if necessary).
        filtered_df: DataFrame of candidate projects. Its index values are
            used as column positions into ``similarity_matrix``
            (assumes index labels equal matrix positions - TODO confirm
            against the caller).
        top_x: Maximum number of similar projects to return.

    Returns:
        A copy of the selected ``filtered_df`` rows, ordered by descending
        similarity, with the score in a ``similarity`` column. Empty when
        ``top_x <= 0`` or ``filtered_df`` is empty.
    """
    # Ensure the similarity_matrix is in a suitable sparse format like CSR
    if not isinstance(similarity_matrix, csr_matrix):
        similarity_matrix = csr_matrix(similarity_matrix)

    # Candidate columns: just the projects present in filtered_df
    filtered_indices = filtered_df.index.tolist()

    if top_x <= 0 or not filtered_indices:
        # ``[-0:]`` would select every candidate, so handle this explicitly.
        empty_df = filtered_df.iloc[[]].copy()
        empty_df["similarity"] = np.empty(0, dtype=similarity_matrix.dtype)
        return empty_df

    # Take the single project row first, then restrict it to the candidate
    # columns; only this one row is densified for argsort.
    project_row = (
        similarity_matrix.getrow(p_index)[:, filtered_indices].toarray().ravel()
    )

    # Positions (within filtered_df) of the highest scores, best first
    sorted_positions = np.argsort(project_row)[-top_x:][::-1]

    # Copy so that adding the column never writes into filtered_df
    result_df = filtered_df.iloc[sorted_positions].copy()
    result_df["similarity"] = project_row[sorted_positions]

    return result_df
|
similarity_page.py
CHANGED
@@ -28,21 +28,36 @@ def get_process_memory():
|
|
28 |
"""
|
29 |
|
30 |
# Catch DATA
|
|
|
31 |
# Load Similarity matrix
|
|
|
32 |
@st.cache_data
|
33 |
def load_sim_matrix():
|
34 |
loaded_matrix = load_npz("src/extended_similarities.npz")
|
35 |
dense_matrix = loaded_matrix.toarray().astype('float16')
|
36 |
|
37 |
return dense_matrix
|
|
|
|
|
|
|
|
|
|
|
38 |
|
|
|
39 |
# Load Non Similar Orga Matrix
|
|
|
40 |
@st.cache_data
|
41 |
def load_nonsameorga_sim_matrix():
|
42 |
loaded_matrix = load_npz("src/extended_similarities_nonsimorga.npz")
|
43 |
dense_matrix = loaded_matrix.toarray().astype('float16')
|
44 |
|
45 |
return dense_matrix
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
|
47 |
# Load Projects DFs
|
48 |
@st.cache_data
|
|
|
28 |
"""
|
29 |
|
30 |
# Catch DATA
|
31 |
+
|
32 |
# Load Similarity matrix
|
33 |
+
"""
|
34 |
@st.cache_data
|
35 |
def load_sim_matrix():
|
36 |
loaded_matrix = load_npz("src/extended_similarities.npz")
|
37 |
dense_matrix = loaded_matrix.toarray().astype('float16')
|
38 |
|
39 |
return dense_matrix
|
40 |
+
"""
|
41 |
+
@st.cache_data
def load_sim_matrix():
    """Load the similarity matrix from ``src/extended_similarities.npz``.

    The result of ``load_npz`` is returned as-is (no dense conversion).
    """
    return load_npz("src/extended_similarities.npz")
|
47 |
# Load Non Similar Orga Matrix
|
48 |
+
"""
|
49 |
@st.cache_data
|
50 |
def load_nonsameorga_sim_matrix():
|
51 |
loaded_matrix = load_npz("src/extended_similarities_nonsimorga.npz")
|
52 |
dense_matrix = loaded_matrix.toarray().astype('float16')
|
53 |
|
54 |
return dense_matrix
|
55 |
+
"""
|
56 |
+
# The decorator is restored here: it had ended up inside the string that
# comments out the previous implementation, leaving this loader uncached
# while load_sim_matrix is cached.
@st.cache_data
def load_nonsameorga_sim_matrix():
    """Load the matrix from ``src/extended_similarities_nonsimorga.npz``.

    The result of ``load_npz`` is returned as-is (no dense conversion).
    """
    return load_npz("src/extended_similarities_nonsimorga.npz")
|
61 |
|
62 |
# Load Projects DFs
|
63 |
@st.cache_data
|
src/extended_similarities.npz
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2c8747d1e71428e191cab9b6bb7187a7ede099e83f722cd8dabd133b3e994ac4
|
3 |
+
size 2779951
|