# ki_rag_classify / my_2_sim_search.py
# (Hugging Face page residue from upload: elia-waefler — "Upload 17 files" — c2b923e verified)
import my_new_openai
import my_1_writer
import json
import numpy as np
# sim search with dot_product and lin_distance
# the newly vectorized TERM will be added to the database
# database = .json file
def sim_search_load_db(database, term, add_to_db=True, debug=False):
    """Similarity-search *term* against a vector table stored in a JSON file.

    Args:
        database: path to a .json file mapping key -> embedding vector.
        term: either a string (vectorized via my_new_openai.vectorize_data)
            or an already-computed embedding vector (list of floats).
        add_to_db: if True, store the term's vector back into the table
            (written via my_1_writer.safe_my_dict_as_json).
        debug: print intermediate vectors and scores.

    Returns:
        dict of key -> score, sorted with the biggest score first,
        or None if *term* is neither str nor list.
    """
    if isinstance(term, str):
        print("str")
        vector1 = my_new_openai.vectorize_data(term)
    elif isinstance(term, list):
        print("list")
        vector1 = term
    else:
        print("invalid search_term/search_vector format")
        return
    with open(database, "r") as f:
        table = json.load(f)
    sim_search_dict = {}
    for key in table.keys():
        vector2 = table[key]
        if debug:
            print("")
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        dp = np.dot(vector1, vector2)
        distance = np.linalg.norm(np.array(vector1) - np.array(vector2))
        if debug:
            print(f"the dp is {dp}")
            print(f"the distance is{distance}")
            print("")
            print("")
            print("")
        # NOTE(review): dot-product * distance is an unusual similarity score
        # (a larger distance inflates the score); preserved as-is — confirm intent.
        sim_search_dict[key] = dp * distance
    # sort with the biggest similarity first
    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))
    if debug:
        # BUG FIX: dicts are not sliceable; take the first five items instead.
        for key, value in list(sorted_table.items())[:5]:
            print(f"{key}: {value}")
    if add_to_db:
        # BUG FIX: entries are stored under str(term), and `list in dict_keys`
        # raises TypeError (unhashable type); compare the stringified term.
        if str(term) in table:
            print("the search term is in the database!")
        # add the newly vectorized term to the words, if not already in the vector table
        else:
            if database != "session/my_words_vec_table.json":
                database = "session/my_vecs.json"
            table[str(term)] = vector1
            my_1_writer.safe_my_dict_as_json(database, table)
    print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
    return sorted_table
def dot_p_to_1(database, vector1=0, analysis_filename=0):
    """Compute the dot product of every vector in *database* against vector1.

    Args:
        database: path to a .json file mapping key -> embedding vector.
        vector1: reference vector; sentinel 0 selects a constant 1536-dim
            vector, sentinel 1 selects the first vector found in the table.
        analysis_filename: .json path the result is saved to (handed to
            my_1_writer.safe_my_dict_as_json; default 0 looks unusable —
            TODO confirm callers always pass a real path).

    Returns:
        dict of key -> dot product with vector1 (also saved to disk).
        Original returned None; returning the result is backward-compatible.
    """
    with open(database, "r") as f:
        table = json.load(f)
    if vector1 == 0:
        # constant default sized for OpenAI ada-002-style 1536-dim embeddings
        vector1 = [0.025515518153991442 for _ in range(1536)]
    elif vector1 == 1:
        vector1 = table[str(list(table.keys())[0])]
    dot_product_to1 = {key: np.dot(vector1, table[key]) for key in table}
    my_1_writer.safe_my_dict_as_json(analysis_filename, dot_product_to1)
    print("dot p to 1 saved")
    return dot_product_to1
def lin_dist(database, vector1=0, analysis_filename=0):
    """Save the Euclidean (L2) distance from vector1 to every table vector.

    Args:
        database: path to a .json file mapping key -> embedding vector.
        vector1: reference vector; sentinel 0 selects a constant 1536-dim
            vector, sentinel 1 selects the first vector found in the table.
        analysis_filename: .json path written via my_1_writer.
    """
    with open(database, "r") as source:
        vec_table = json.load(source)
    lin_dist_to_1 = {}
    if vector1 == 0:
        # constant default sized for 1536-dim embeddings
        vector1 = [0.025515518153991442] * 1536
    elif vector1 == 1:
        first_key = str(list(vec_table.keys())[0])
        vector1 = vec_table[first_key]
    reference = np.array(vector1)
    for name, vec in vec_table.items():
        lin_dist_to_1[name] = np.linalg.norm(reference - np.array(vec))
    my_1_writer.safe_my_dict_as_json(analysis_filename, lin_dist_to_1)
    print("lin dist to 1 saved")
def manhattan_dist(database, vector1=0, analysis_filename=0):
    """Save the Manhattan (L1) distance from vector1 to every table vector.

    Args:
        database: path to a .json file mapping key -> embedding vector.
        vector1: reference vector; sentinel 0 selects a constant 1536-dim
            vector, sentinel 1 selects the first vector found in the table.
        analysis_filename: .json path written via my_1_writer.

    Returns:
        dict of key -> L1 distance (also saved to disk). Original returned
        None; returning the result is backward-compatible.
    """
    with open(database, "r") as f:
        table = json.load(f)
    if vector1 == 0:
        # constant default sized for 1536-dim embeddings
        vector1 = [0.025515518153991442 for _ in range(1536)]
    elif vector1 == 1:
        vector1 = table[str(list(table.keys())[0])]
    reference = np.array(vector1)
    manhattan_dist_to_1 = {}
    for key in table.keys():
        # BUG FIX: Manhattan distance is the sum of ABSOLUTE differences;
        # the original summed signed differences, which cancel each other out
        # (and could even yield negative "distances").
        manhattan_dist_to_1[key] = np.sum(np.abs(reference - np.array(table[key])))
    my_1_writer.safe_my_dict_as_json(analysis_filename, manhattan_dist_to_1)
    print("manhattan dist to 1 saved")
    return manhattan_dist_to_1
#vec_table
def sim_search_fly(vec_table, term, debug=False):
    """Similarity-search *term* against an in-memory vector table.

    Args:
        vec_table: dict mapping key -> embedding vector. Vectors must have at
            least 7 components (indices 0..6 are inspected below).
        term: either a string (vectorized via my_new_openai.vectorize_data)
            or an already-computed embedding vector (list of floats).
        debug: print intermediate vectors and scores.

    Returns:
        dict of key -> dot-product score, biggest first, or None if *term*
        is neither str nor list.
    """
    if debug:
        print(type(vec_table))
        print(type(term))
        print(type(vec_table[list(vec_table.keys())[0]]))
        print("vec table:")
        print(vec_table[list(vec_table.keys())[5]][:4])
        print("search term")
        print(term[:4])
    if isinstance(term, str):
        print("str")
        vector1 = my_new_openai.vectorize_data(term)
    elif isinstance(term, list):
        print("list")
        vector1 = term
    else:
        print("invalid search_term/search_vector format")
        return
    sim_search_dict = {}
    for key in vec_table.keys():
        vector2 = vec_table[key]
        if debug:
            print("")
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        # NOTE(review): vectors with pairwise-equal components at indices
        # (0,1), (3,4) and (5,6) are pinned to a max score of 200 — looks
        # like a sentinel for degenerate/placeholder entries; confirm intent.
        if vector2[0] == vector2[1] and vector2[3] == vector2[4] and vector2[5] == vector2[6]:
            dp = 200
        else:
            dp = np.dot(vector1, vector2)
        if debug:
            print(f"the dp is {dp}")
            print("")
            print("")
            print("")
        sim_search_dict[key] = dp
    # sort with the biggest similarity first
    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))
    if debug:
        # BUG FIX: dicts are not sliceable; iterate the first five items.
        for key, value in list(sorted_table.items())[:5]:
            print(f"{key}: {value}")
    print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
    return sorted_table