# (extraction artifacts removed: "Spaces:" / "Runtime error" — not part of the module)
import my_new_openai
import my_1_writer
import json
import numpy as np
# sim search with dot_product and lin_distance
# the newly vectorized TERM will be added to the database
# database = .json file
def sim_search_load_db(database, term, add_to_db=True, debug=False):
    """Similarity-search ``term`` against the vector table stored in ``database``.

    Parameters
    ----------
    database : str
        Path to a JSON file mapping words -> embedding vectors.
    term : str | list
        A raw string (vectorized via ``my_new_openai.vectorize_data``) or an
        already-computed embedding vector.
    add_to_db : bool
        If True, store the vectorized term in the database when missing.
    debug : bool
        Print per-key diagnostics.

    Returns
    -------
    dict | None
        Per-key scores sorted descending, or None for an invalid ``term`` type.

    NOTE(review): the score is dot(term, v) * euclidean_distance(term, v) and
    the table is sorted with the *largest* score first — a larger distance
    means *less* similar, so confirm this ranking is intended.
    """
    if isinstance(term, str):
        print("str")
        vector1 = my_new_openai.vectorize_data(term)
    elif isinstance(term, list):
        print("list")
        vector1 = term
    else:
        print("invalid search_term/search_vector format")
        return
    with open(database, "r") as f:
        table = json.load(f)
    sim_search_dict = {}
    for key in table.keys():
        vector2 = table[key]
        if debug:
            print("")
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        dp = np.dot(vector1, vector2)
        distance = np.linalg.norm(np.array(vector1) - np.array(vector2))
        if debug:
            print(f"the dp is {dp}")
            print(f"the distance is{distance}")
            print("")
            print("")
            print("")
        sim_search_dict[key] = dp * distance
    # sort with the biggest similarity
    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))
    if debug:
        # BUGFIX: dicts are not sliceable; take the first five items explicitly.
        for key, value in list(sorted_table.items())[:5]:
            print(f"{key}: {value}")
    if add_to_db:
        # BUGFIX: a list-valued term is unhashable, so membership must be
        # tested with the same str() key used for insertion below.
        if str(term) in table:
            print("the search term is in the database!")
        # add the newly vectorized term to the words, if not already in the vector table
        else:
            if database != "session/my_words_vec_table.json":
                database = "session/my_vecs.json"
            # table = load_df(database)  # ??
            table[str(term)] = vector1
            my_1_writer.safe_my_dict_as_json(database, table)
    # first_key, first_value = list(sortedTable.items())[0]
    print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
    return sorted_table
def dot_p_to_1(database, vector1=0, analysis_filename=0):
    """Dot every vector in ``database`` against a reference vector and save
    the per-key results to ``analysis_filename`` as JSON.

    Reference selection:
      * ``vector1 == 0`` — a constant 1536-dim vector;
      * ``vector1 == 1`` — the first vector stored in the table;
      * anything else   — used directly as the reference vector.
    """
    with open(database, "r") as handle:
        vec_table = json.load(handle)
    if vector1 == 0:
        reference = [0.025515518153991442] * 1536
    elif vector1 == 1:
        reference = vec_table[str(next(iter(vec_table)))]
    else:
        reference = vector1
    scores = {word: np.dot(reference, vec) for word, vec in vec_table.items()}
    my_1_writer.safe_my_dict_as_json(analysis_filename, scores)
    print("dot p to 1 saved")
def lin_dist(database, vector1=0, analysis_filename=0):
    """Compute the Euclidean (L2) distance of every vector in ``database``
    to a reference vector and save the per-key results to
    ``analysis_filename`` as JSON.

    Reference selection:
      * ``vector1 == 0`` — a constant 1536-dim vector;
      * ``vector1 == 1`` — the first vector stored in the table;
      * anything else   — used directly as the reference vector.
    """
    with open(database, "r") as handle:
        vec_table = json.load(handle)
    if vector1 == 0:
        reference = [0.025515518153991442] * 1536
    elif vector1 == 1:
        reference = vec_table[str(next(iter(vec_table)))]
    else:
        reference = vector1
    ref = np.array(reference)
    distances = {
        word: np.linalg.norm(ref - np.array(vec))
        for word, vec in vec_table.items()
    }
    my_1_writer.safe_my_dict_as_json(analysis_filename, distances)
    print("lin dist to 1 saved")
def manhattan_dist(database, vector1=0, analysis_filename=0):
    """Compute the Manhattan (L1) distance of every vector in ``database``
    to a reference vector and save the per-key results to
    ``analysis_filename`` as JSON.

    Reference selection:
      * ``vector1 == 0`` — a constant 1536-dim vector;
      * ``vector1 == 1`` — the first vector stored in the table;
      * anything else   — used directly as the reference vector.
    """
    with open(database, "r") as f:
        table = json.load(f)
    manhattan_dist_to_1 = {}
    if vector1 == 0:
        vector1 = [0.025515518153991442 for _ in range(1536)]
    elif vector1 == 1:
        vector1 = table[str(list(table.keys())[0])]
    v1 = np.array(vector1)  # hoisted: loop-invariant conversion
    for key in table.keys():
        # BUGFIX: Manhattan distance is the sum of |component differences|;
        # the previous signed sum let positive and negative components cancel.
        manhattan_dist_to_1[key] = float(np.sum(np.abs(v1 - np.array(table[key]))))
    my_1_writer.safe_my_dict_as_json(analysis_filename, manhattan_dist_to_1)
    print("manhattan dist to 1 saved")
# vec_table
def sim_search_fly(vec_table, term, debug=False):
    """Similarity-search ``term`` against an in-memory vector table.

    Parameters
    ----------
    vec_table : dict
        Mapping of words -> embedding vectors (lists of floats).
    term : str | list
        A raw string (vectorized via ``my_new_openai.vectorize_data``) or an
        already-computed embedding vector.
    debug : bool
        Print per-key diagnostics (indexes the 6th table entry, so the table
        needs at least 6 rows when enabled).

    Returns
    -------
    dict | None
        Per-key dot products sorted descending, or None for an invalid
        ``term`` type.

    NOTE(review): vectors satisfying v[0]==v[1], v[3]==v[4] and v[5]==v[6]
    are pinned to a score of 200 — presumably a placeholder/sentinel
    convention; confirm against the code that builds ``vec_table``.
    """
    if debug:
        print(type(vec_table))
        print(type(term))
        print(type(vec_table[list(vec_table.keys())[0]]))
        print("vec table:")
        print(vec_table[list(vec_table.keys())[5]][:4])
        print("search term")
        print(term[:4])
    if isinstance(term, str):
        print("str")
        vector1 = my_new_openai.vectorize_data(term)
    elif isinstance(term, list):
        print("list")
        vector1 = term
    else:
        print("invalid search_term/search_vector format")
        return
    sim_search_dict = {}
    for key in vec_table.keys():
        vector2 = vec_table[key]
        if debug:
            print("")
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        if vector2[0] == vector2[1] and vector2[3] == vector2[4] and vector2[5] == vector2[6]:
            dp = 200
        else:
            dp = np.dot(vector1, vector2)
        # distance = np.linalg.norm(np.array(vector1) - np.array(vector2))
        if debug:
            print(f"the dp is {dp}")
            # print(f"the distance is{distance}")
            print("")
            print("")
            print("")
        sim_search_dict[key] = dp  # * distance
    # sort with the biggest similarity
    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))
    if debug:
        # BUGFIX: dicts are not sliceable; take the first five items explicitly.
        for key, value in list(sorted_table.items())[:5]:
            print(f"{key}: {value}")
    # first_key, first_value = list(sortedTable.items())[0]
    print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
    return sorted_table