import my_new_openai
import my_1_writer
import json
import numpy as np
# Similarity search using the dot product and linear (Euclidean) distance.
# The newly vectorized TERM will be added to the database.
# database = a .json file mapping term -> embedding vector.
def sim_search_load_db(database, term, add_to_db=True, debug=False):
    """Similarity search against a JSON vector table on disk.

    term may be a raw string (it will be vectorized) or an embedding vector (list).
    If add_to_db is True, a newly vectorized term is written back to the table.
    """
    if isinstance(term, str):
        print("str")
        vector1 = my_new_openai.vectorize_data(term)
    elif isinstance(term, list):
        print("list")
        vector1 = term
    else:
        print("invalid search_term/search_vector format")
        return
    with open(database, "r") as f:
        table = json.load(f)
    sim_search_dict = {}
    for key in table.keys():
        vector2 = table[key]
        if debug:
            print("")
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        dp = np.dot(vector1, vector2)
        distance = np.linalg.norm(np.array(vector1) - np.array(vector2))
        if debug:
            print(f"the dp is {dp}")
            print(f"the distance is {distance}")
            print("")
        # score combines the dot product and the Euclidean distance
        sim_search_dict[key] = dp * distance
    # sort with the biggest similarity first
    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))
    if debug:
        for key, value in list(sorted_table.items())[:5]:
            print(f"{key}: {value}")
    if add_to_db:
        if str(term) in table:
            print("the search term is already in the database!")
        else:
            # add the newly vectorized term to the table, if not already in the vector table
            if database != "session/my_words_vec_table.json":
                database = "session/my_vecs.json"
            # table = load_df(database)  # ??
            table[str(term)] = vector1
            my_1_writer.safe_my_dict_as_json(database, table)
    # first_key, first_value = list(sorted_table.items())[0]
    print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
    return sorted_table
def dot_p_to_1(database, vector1=0, analysis_filename=0):
    """Dot product of every vector in the table against a reference vector; result is saved as JSON."""
    with open(database, "r") as f:
        table = json.load(f)
    dot_product_to1 = {}
    if vector1 == 0:
        # default reference: a constant 1536-dimensional vector
        vector1 = [0.025515518153991442 for _ in range(1536)]
    elif vector1 == 1:
        # use the table's first entry as the reference vector
        vector1 = table[str(list(table.keys())[0])]
    for key in table.keys():
        dot_product_to1[key] = np.dot(vector1, table[key])
    my_1_writer.safe_my_dict_as_json(analysis_filename, dot_product_to1)
    print("dot p to 1 saved")
def lin_dist(database, vector1=0, analysis_filename=0):
    """Euclidean (linear) distance of every vector in the table to a reference vector; result is saved as JSON."""
    with open(database, "r") as f:
        table = json.load(f)
    lin_dist_to_1 = {}
    if vector1 == 0:
        # default reference: a constant 1536-dimensional vector
        vector1 = [0.025515518153991442 for _ in range(1536)]
    elif vector1 == 1:
        # use the table's first entry as the reference vector
        vector1 = table[str(list(table.keys())[0])]
    for key in table.keys():
        lin_dist_to_1[key] = np.linalg.norm(np.array(vector1) - np.array(table[key]))
    my_1_writer.safe_my_dict_as_json(analysis_filename, lin_dist_to_1)
    print("lin dist to 1 saved")
def manhattan_dist(database, vector1=0, analysis_filename=0):
    """Manhattan (L1) distance of every vector in the table to a reference vector; result is saved as JSON."""
    with open(database, "r") as f:
        table = json.load(f)
    manhattan_dist_to_1 = {}
    if vector1 == 0:
        # default reference: a constant 1536-dimensional vector
        vector1 = [0.025515518153991442 for _ in range(1536)]
    elif vector1 == 1:
        # use the table's first entry as the reference vector
        vector1 = table[str(list(table.keys())[0])]
    for key in table.keys():
        # Manhattan distance sums the absolute component differences, not the raw signed differences
        manhattan_dist_to_1[key] = np.sum(np.abs(np.array(vector1) - np.array(table[key])))
    my_1_writer.safe_my_dict_as_json(analysis_filename, manhattan_dist_to_1)
    print("manhattan dist to 1 saved")
# In-memory variant: vec_table is an already-loaded dict instead of a JSON file path.
def sim_search_fly(vec_table, term, debug=False):
    """Similarity search over an in-memory vector table (dict of key -> embedding vector)."""
    if debug:
        print(type(vec_table))
        print(type(term))
        print(type(vec_table[list(vec_table.keys())[0]]))
        print("vec table:")
        print(vec_table[list(vec_table.keys())[5]][:4])
        print("search term")
        print(term[:4])
    if isinstance(term, str):
        print("str")
        vector1 = my_new_openai.vectorize_data(term)
    elif isinstance(term, list):
        print("list")
        vector1 = term
    else:
        print("invalid search_term/search_vector format")
        return
    sim_search_dict = {}
    for key in vec_table.keys():
        vector2 = vec_table[key]
        if debug:
            print("")
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        # vectors with repeated identical components get a fixed sentinel score instead of a real dot product
        if vector2[0] == vector2[1] and vector2[3] == vector2[4] and vector2[5] == vector2[6]:
            dp = 200
        else:
            dp = np.dot(vector1, vector2)
        # distance = np.linalg.norm(np.array(vector1) - np.array(vector2))
        if debug:
            print(f"the dp is {dp}")
            # print(f"the distance is {distance}")
            print("")
        sim_search_dict[key] = dp  # * distance
    # sort with the biggest similarity first
    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))
    if debug:
        for key, value in list(sorted_table.items())[:5]:
            print(f"{key}: {value}")
    # first_key, first_value = list(sorted_table.items())[0]
    print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
    return sorted_table
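

# A minimal usage sketch, assuming a vector table already exists on disk. The query string and the
# analysis output filenames below are placeholders chosen for illustration, not names required by
# this module; "session/my_vecs.json" is the default path the functions above fall back to.
if __name__ == "__main__":
    example_db = "session/my_vecs.json"
    # search the on-disk table for the entry closest to a free-text query
    results = sim_search_load_db(example_db, "example query", add_to_db=False, debug=False)
    # save dot-product / distance analyses of the table against the default reference vector
    dot_p_to_1(example_db, vector1=0, analysis_filename="session/dot_p_to_1.json")
    lin_dist(example_db, vector1=0, analysis_filename="session/lin_dist_to_1.json")
    manhattan_dist(example_db, vector1=0, analysis_filename="session/manhattan_dist_to_1.json")
    # on-the-fly variant: load the table yourself and search the in-memory dict
    with open(example_db, "r") as f:
        vec_table = json.load(f)
    sim_search_fly(vec_table, "example query", debug=False)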