import json

import numpy as np

import my_1_writer
import my_new_openai

# Similarity search over a table of vectors (a mapping of term -> vector),
# scored with the dot product and the Euclidean ("linear") distance.
# The database is a .json file; a newly vectorized search term can be
# added to it.

# Database that is updated in place when a new term is added.
_WORDS_DB_PATH = "session/my_words_vec_table.json"
# File that receives the updated table when any other database was searched.
_FALLBACK_DB_PATH = "session/my_vecs.json"

# Default reference vector: 1536 equal components of ~1/sqrt(1536) each,
# which makes it (approximately) unit length.
_DEFAULT_DIM = 1536
_DEFAULT_COMPONENT = 0.025515518153991442

# Fixed score assigned by sim_search_fly to vectors flagged by
# _looks_like_placeholder.
_PLACEHOLDER_SCORE = 200


def _load_table(database):
    """Load the term -> vector mapping stored in the JSON file *database*."""
    with open(database, "r") as f:
        return json.load(f)


def _term_to_vector(term):
    """Return the search vector for *term*, or None if its type is unsupported.

    A string is vectorized through ``my_new_openai.vectorize_data``; a list is
    taken to already be a vector and returned unchanged.
    """
    if isinstance(term, str):
        print("str")
        return my_new_openai.vectorize_data(term)
    if isinstance(term, list):
        print("list")
        return term
    print("invalid search_term/search_vector format")
    return None


def _rank_descending(scores, debug=False):
    """Return *scores* as a dict ordered from the highest to the lowest score."""
    ranked = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))
    if debug:
        # A dict cannot be sliced; materialize the items to show the top five.
        for key, value in list(ranked.items())[:5]:
            print(f"{key}: {value}")
    return ranked


def _print_closest(ranked):
    """Print the best-ranked key, or a notice when there is nothing to rank."""
    if ranked:
        print(f"the closest word to your input is: {next(iter(ranked))}")
    else:
        print("the vector table is empty, nothing to compare against")


def _reference_vector(table, vector1):
    """Resolve the *vector1* argument of the analysis functions.

    The scalar sentinel ``0`` selects the default uniform vector and ``1``
    selects the first vector stored in *table*; anything else is returned
    unchanged and used as the reference vector itself.
    """
    # Only compare scalars against the sentinels: ``array == 0`` would be
    # evaluated element-wise and has no single truth value.
    if np.isscalar(vector1):
        if vector1 == 0:
            return [_DEFAULT_COMPONENT] * _DEFAULT_DIM
        if vector1 == 1:
            # With an empty table no comparison takes place, so any value works.
            return table[next(iter(table))] if table else []
    return vector1


def _save_metric(database, vector1, analysis_filename, metric, done_message):
    """Apply *metric* between the reference vector and every table entry and save it.

    The resulting ``{key: value}`` dict is handed to
    ``my_1_writer.safe_my_dict_as_json`` together with *analysis_filename*,
    then *done_message* is printed.
    """
    table = _load_table(database)
    reference = np.asarray(_reference_vector(table, vector1))
    results = {key: metric(reference, np.asarray(vector)) for key, vector in table.items()}
    my_1_writer.safe_my_dict_as_json(analysis_filename, results)
    print(done_message)


def _looks_like_placeholder(vector):
    """Return True if components 0/1, 3/4 and 5/6 of *vector* are pairwise equal.

    NOTE(review): presumably this detects dummy/degenerate vectors - confirm
    with the code that produces the table.
    """
    return (
        len(vector) > 6
        and vector[0] == vector[1]
        and vector[3] == vector[4]
        and vector[5] == vector[6]
    )


def sim_search_load_db(database, term, add_to_db=True, debug=False):
    """Rank every entry of a JSON vector database against a search term.

    Each entry is scored with ``dot_product * euclidean_distance`` between the
    search vector and the stored vector, and the scores are returned sorted
    from highest to lowest.

    Args:
        database: Path of the JSON file holding the term -> vector mapping.
        term: Search term. A ``str`` is vectorized via
            ``my_new_openai.vectorize_data``; a ``list`` is used as the vector.
        add_to_db: If true and ``str(term)`` is not yet a key of the table,
            the term and its vector are added and the table is written with
            ``my_1_writer.safe_my_dict_as_json``. The table is written back to
            *database* only if that is ``"session/my_words_vec_table.json"``;
            otherwise it is written to ``"session/my_vecs.json"``.
        debug: Print the vectors, intermediate values and the top five results.

    Returns:
        A dict mapping each table key to its score, ordered from the highest
        score down, or ``None`` if *term* is neither a ``str`` nor a ``list``.
    """
    vector1 = _term_to_vector(term)
    if vector1 is None:
        return None

    table = _load_table(database)
    query = np.asarray(vector1)

    sim_search_dict = {}
    for key, vector2 in table.items():
        if debug:
            print("")
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        dp = np.dot(vector1, vector2)
        distance = np.linalg.norm(query - np.array(vector2))
        if debug:
            print(f"the dp is {dp}")
            print(f"the distance is{distance}")
            print("")
            print("")
            print("")
        sim_search_dict[key] = dp * distance

    # sort with the biggest similarity first
    sorted_table = _rank_descending(sim_search_dict, debug)

    if add_to_db:
        # Keys are stored as str(term), so look the term up the same way; a
        # list term is unhashable and cannot be tested against the keys directly.
        term_key = str(term)
        if term_key in table:
            print("the search term is in the database!")
        else:
            # add the newly vectorized term, if not already in the vector table
            if database != _WORDS_DB_PATH:
                database = _FALLBACK_DB_PATH
            table[term_key] = vector1
            my_1_writer.safe_my_dict_as_json(database, table)

    _print_closest(sorted_table)
    return sorted_table


def dot_p_to_1(database, vector1=0, analysis_filename=0):
    """Save the dot product between a reference vector and every database entry.

    Args:
        database: Path of the JSON file holding the term -> vector mapping.
        vector1: Reference vector. ``0`` selects a uniform 1536-component
            vector, ``1`` selects the first vector of the database, any other
            value is used as the vector itself.
        analysis_filename: Target passed to
            ``my_1_writer.safe_my_dict_as_json`` for the ``{key: dot product}``
            result.
    """
    _save_metric(database, vector1, analysis_filename, np.dot, "dot p to 1 saved")


def lin_dist(database, vector1=0, analysis_filename=0):
    """Save the Euclidean distance between a reference vector and every entry.

    Args:
        database: Path of the JSON file holding the term -> vector mapping.
        vector1: Reference vector. ``0`` selects a uniform 1536-component
            vector, ``1`` selects the first vector of the database, any other
            value is used as the vector itself.
        analysis_filename: Target passed to
            ``my_1_writer.safe_my_dict_as_json`` for the ``{key: distance}``
            result.
    """
    _save_metric(
        database,
        vector1,
        analysis_filename,
        lambda reference, vector: np.linalg.norm(reference - vector),
        "lin dist to 1 saved",
    )


def manhattan_dist(database, vector1=0, analysis_filename=0):
    """Save the Manhattan (L1) distance between a reference vector and every entry.

    Args:
        database: Path of the JSON file holding the term -> vector mapping.
        vector1: Reference vector. ``0`` selects a uniform 1536-component
            vector, ``1`` selects the first vector of the database, any other
            value is used as the vector itself.
        analysis_filename: Target passed to
            ``my_1_writer.safe_my_dict_as_json`` for the ``{key: distance}``
            result.
    """
    _save_metric(
        database,
        vector1,
        analysis_filename,
        # Sum the absolute differences: without abs() positive and negative
        # differences cancel and the value is not a distance.
        lambda reference, vector: np.sum(np.abs(reference - vector)),
        "manhattan dist to 1 saved",
    )


def sim_search_fly(vec_table, term, debug=False):
    """Rank every entry of an in-memory vector table against a search term.

    Entries are scored with the dot product between the search vector and the
    stored vector. Vectors flagged by ``_looks_like_placeholder`` receive the
    fixed score 200 instead.

    Args:
        vec_table: Dict mapping keys to vectors.
        term: Search term. A ``str`` is vectorized via
            ``my_new_openai.vectorize_data``; a ``list`` is used as the vector.
        debug: Print type information, intermediate values and the top five
            results.

    Returns:
        A dict mapping each table key to its score, ordered from the highest
        score down, or ``None`` if *term* is neither a ``str`` nor a ``list``.
    """
    if debug:
        print(type(vec_table))
        print(type(term))
        keys = list(vec_table.keys())
        if keys:
            print(type(vec_table[keys[0]]))
            print("vec table:")
            # Show the sixth entry as a sample, or the last one of a smaller table.
            print(vec_table[keys[min(5, len(keys) - 1)]][:4])
        if isinstance(term, (str, list)):
            print("search term")
            print(term[:4])

    vector1 = _term_to_vector(term)
    if vector1 is None:
        return None

    sim_search_dict = {}
    for key, vector2 in vec_table.items():
        if debug:
            print("")
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        if _looks_like_placeholder(vector2):
            dp = _PLACEHOLDER_SCORE
        else:
            dp = np.dot(vector1, vector2)
        if debug:
            print(f"the dp is {dp}")
            print("")
            print("")
            print("")
        sim_search_dict[key] = dp

    # sort with the biggest similarity first
    sorted_table = _rank_descending(sim_search_dict, debug)
    _print_closest(sorted_table)
    return sorted_table