File size: 5,302 Bytes
0c4e36a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fce75da
0c4e36a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fce75da
0c4e36a
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import my_new_openai
import my_1_writer
import json
import numpy as np


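# Similarity search: every stored vector is scored as (dot product * Euclidean distance) against the term.
# When add_to_db is set, the newly vectorized term is written back to the database.
# `database` is the path of a .json file that maps keys to vectors.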
def sim_search_load_db(database, term, add_to_db=True, debug=False):
    """Rank every vector stored in a JSON database against a search term.

    Each entry is scored as ``dot(term_vector, entry_vector)`` multiplied by the
    Euclidean distance between the two vectors, and the entries are returned
    ordered from the highest to the lowest score.

    Args:
        database: Path of a JSON file holding a ``{key: vector}`` mapping.
        term: A string, which is vectorized with
            ``my_new_openai.vectorize_data``, or a list that is used directly
            as the search vector.
        add_to_db: When true and ``str(term)`` is not yet a key of the table,
            the table plus the new entry is saved with
            ``my_1_writer.safe_my_dict_as_json``.
        debug: When true, print the vectors, the intermediate values and the
            five best-scoring entries.

    Returns:
        A dict mapping each key to its score, ordered by descending score
        (empty when the database is empty), or ``None`` when ``term`` is
        neither a string nor a list.
    """
    if isinstance(term, str):
        print("str")
        vector1 = my_new_openai.vectorize_data(term)
    elif isinstance(term, list):
        print("list")
        vector1 = term
    else:
        print("invalid search_term/search_vector format")
        return None

    with open(database, "r") as f:
        table = json.load(f)

    # Convert the search vector once instead of once per table entry.
    query = np.array(vector1)
    sim_search_dict = {}
    for key, vector2 in table.items():
        if debug:
            print("")
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        candidate = np.array(vector2)
        dp = np.dot(query, candidate)
        distance = np.linalg.norm(query - candidate)
        if debug:
            print(f"the dp is {dp}")
            print(f"the distance is{distance}")
            print("")
            print("")
            print("")
        sim_search_dict[key] = dp * distance

    # sort with the biggest similarity
    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))

    if debug:
        # A dict cannot be sliced; take the first five (key, score) pairs instead.
        for key, value in list(sorted_table.items())[:5]:
            print(f"{key}: {value}")

    if add_to_db:
        # Entries are stored under str(term), so membership is tested with the
        # same key. Testing the raw term would fail for (unhashable) lists.
        term_key = str(term)
        if term_key in table:
            print("the search term is in the database!")
        else:
            # NOTE(review): unless `database` is exactly
            # "session/my_words_vec_table.json", the updated table is written
            # to "session/my_vecs.json" rather than to the file that was
            # read - confirm this redirect is intended.
            if database != "session/my_words_vec_table.json":
                database = "session/my_vecs.json"
            table[term_key] = vector1
            my_1_writer.safe_my_dict_as_json(database, table)

    # An empty database has no closest entry to report.
    if sorted_table:
        print(f"the closest word to your input is: {next(iter(sorted_table))}")
    return sorted_table


def dot_p_to_1(database, vector1=0, analysis_filename=0):
    """Save the dot product of every stored vector with one reference vector.

    Args:
        database: Path of a JSON file holding a ``{key: vector}`` mapping.
        vector1: Reference vector. ``0`` selects a vector of 1536 identical
            components (each about ``1 / sqrt(1536)``, i.e. unit length),
            ``1`` selects the first vector of the table, and any other value
            is used as the reference vector itself.
        analysis_filename: Destination handed unchanged to
            ``my_1_writer.safe_my_dict_as_json`` together with the results.
    """
    with open(database, "r") as handle:
        vectors_by_key = json.load(handle)

    # Turn the 0 / 1 sentinels into an actual reference vector.
    if vector1 == 0:
        vector1 = [0.025515518153991442] * 1536
    elif vector1 == 1:
        first_key = str(list(vectors_by_key.keys())[0])
        vector1 = vectors_by_key[first_key]

    scores = {
        key: np.dot(vector1, stored)
        for key, stored in vectors_by_key.items()
    }
    my_1_writer.safe_my_dict_as_json(analysis_filename, scores)
    print("dot p to 1 saved")


def lin_dist(database, vector1=0, analysis_filename=0):
    """Save the Euclidean distance of every stored vector to one reference vector.

    Args:
        database: Path of a JSON file holding a ``{key: vector}`` mapping.
        vector1: Reference vector. ``0`` selects a vector of 1536 identical
            components (each about ``1 / sqrt(1536)``, i.e. unit length),
            ``1`` selects the first vector of the table, and any other value
            is used as the reference vector itself.
        analysis_filename: Destination handed unchanged to
            ``my_1_writer.safe_my_dict_as_json`` together with the results.
    """
    with open(database, "r") as handle:
        vectors_by_key = json.load(handle)

    # Turn the 0 / 1 sentinels into an actual reference vector.
    if vector1 == 0:
        vector1 = [0.025515518153991442] * 1536
    elif vector1 == 1:
        first_key = str(list(vectors_by_key.keys())[0])
        vector1 = vectors_by_key[first_key]

    reference = np.array(vector1)
    distances = {
        key: np.linalg.norm(reference - np.array(stored))
        for key, stored in vectors_by_key.items()
    }

    my_1_writer.safe_my_dict_as_json(analysis_filename, distances)
    print("lin dist to 1 saved")


def manhattan_dist(database, vector1=0, analysis_filename=0):
    """Save the Manhattan (L1) distance of every stored vector to a reference vector.

    The distance is the sum of the absolute component-wise differences.

    Args:
        database: Path of a JSON file holding a ``{key: vector}`` mapping.
        vector1: Reference vector. ``0`` selects a vector of 1536 identical
            components (each about ``1 / sqrt(1536)``, i.e. unit length),
            ``1`` selects the first vector of the table, and any other value
            is used as the reference vector itself.
        analysis_filename: Destination handed unchanged to
            ``my_1_writer.safe_my_dict_as_json`` together with the results.
    """
    with open(database, "r") as f:
        table = json.load(f)

    # Turn the 0 / 1 sentinels into an actual reference vector.
    if vector1 == 0:
        vector1 = [0.025515518153991442 for _ in range(1536)]
    elif vector1 == 1:
        vector1 = table[str(list(table.keys())[0])]

    reference = np.array(vector1)
    # The absolute value is essential: summing signed differences lets positive
    # and negative deviations cancel, which is not a distance. float() keeps the
    # stored values plain Python numbers whatever the input dtype.
    manhattan_dist_to_1 = {
        key: float(np.abs(reference - np.array(stored)).sum())
        for key, stored in table.items()
    }

    my_1_writer.safe_my_dict_as_json(analysis_filename, manhattan_dist_to_1)
    print("manhattan dist to 1 saved")


# vec_table: in-memory dict mapping keys to vectors (no database file is read here)
def sim_search_fly(vec_table, term, debug=False):
    """Rank the vectors of an in-memory table against a search term by dot product.

    Args:
        vec_table: Dict mapping keys to vectors.
        term: A string, which is vectorized with
            ``my_new_openai.vectorize_data``, or a list that is used directly
            as the search vector.
        debug: When true, print type information, the vectors, the
            intermediate values and the five best-scoring entries.

    Returns:
        A dict mapping each key to its score, ordered by descending score
        (empty when ``vec_table`` is empty), or ``None`` when ``term`` is
        neither a string nor a list.
    """
    keys = list(vec_table.keys())
    if debug:
        print(type(vec_table))
        print(type(term))
        if keys:
            print(type(vec_table[keys[0]]))
        print("vec table:")
    # Preview of the sixth entry; skipped for smaller tables, which previously
    # made this line raise IndexError.
    if len(keys) > 5:
        print(vec_table[keys[5]][:4])
    print("search term")
    # Only preview terms that pass the validation below, so an unsupported
    # term reaches the "invalid" branch instead of failing on the slice.
    if isinstance(term, (str, list)):
        print(term[:4])

    if isinstance(term, str):
        print("str")
        vector1 = my_new_openai.vectorize_data(term)
    elif isinstance(term, list):
        print("list")
        vector1 = term
    else:
        print("invalid search_term/search_vector format")
        return None

    sim_search_dict = {}
    for key, vector2 in vec_table.items():
        if debug:
            print("")
            print(f"{vector1}")
            print(f"{vector2}")
            print(f"doing dot product for {key} and {term}")
        # NOTE(review): vectors whose sampled component pairs are equal get the
        # fixed score 200, which sorts them to the top - presumably a marker
        # for placeholder vectors; confirm the intent. The length guard keeps
        # short vectors from raising IndexError on the sampled positions.
        sampled_pairs_equal = (
            len(vector2) > 6
            and vector2[0] == vector2[1]
            and vector2[3] == vector2[4]
            and vector2[5] == vector2[6]
        )
        if sampled_pairs_equal:
            dp = 200
        else:
            dp = np.dot(vector1, vector2)
        if debug:
            print(f"the dp is {dp}")
            print("")
            print("")
            print("")
        sim_search_dict[key] = dp

    # sort with the biggest similarity
    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))

    if debug:
        # A dict cannot be sliced; take the first five (key, score) pairs instead.
        for key, value in list(sorted_table.items())[:5]:
            print(f"{key}: {value}")

    # An empty table has no closest entry to report.
    if sorted_table:
        print(f"the closest word to your input is: {next(iter(sorted_table))}")
    return sorted_table