elia-waefler committed
Commit 8b9aa2c · 1 Parent(s): 661b653

init files, idea

Files changed (6)
  1. .idea/.name +1 -0
  2. my_1_openai.py +59 -0
  3. my_1_reader.py +201 -0
  4. my_1_writer.py +98 -0
  5. my_2_embedder.py +169 -0
  6. my_2_sim_search.py +109 -0
.idea/.name ADDED
@@ -0,0 +1 @@
+ app.py
my_1_openai.py ADDED
@@ -0,0 +1,59 @@
+ import os
+ import openai
+ from openai import OpenAI
+
+
+ openai.api_key = os.environ.get("OPENAI_API_KEY")
+ openai.organization = os.environ.get("OPENAI_ORG_ID")
+ models = {
+     "assistant": "You are a helpful assistant.",
+     "binary": "you are a machine that converts questions or prompts to binary outputs. "
+               "you can only answer 'yes' or 'no'. if uncertain, default to 'no'."
+ }
+
+
+ def gpt4_new(prompt_text):
+     client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
+     response = client.chat.completions.create(
+         model="gpt-4",
+         messages=[{"role": "system",
+                    # "You are a machine that classifies documents."
+                    "content": "Du bist eine Maschine, die Dokumente klassifiziert."},
+                   {"role": "user", "content": prompt_text}])
+     return response.choices[0].message.content
+
+
+ def gpt4(prompt, model=models["assistant"]):
+     # use the chat endpoint of the >=1.0 SDK; the old Completion API does not accept messages
+     response = openai.chat.completions.create(
+         model="gpt-4",
+         messages=[
+             {"role": "system", "content": model},
+             {"role": "user", "content": prompt}
+         ]
+     )
+     return response.choices[0].message.content
+
+
+ def gpt_bool(prompt):
+     """
+     Ask the binary system prompt a yes/no question.
+
+     :param prompt: the text prompt
+     :return: True or False
+     """
+     true_values = ["yes", "Yes", "Y", "y", "yes.", "Yes.", "YES"]
+     return gpt4(prompt, model=models["binary"]) in true_values
+
+
+ def vectorize_data(data_input):
+     try:
+         response = openai.embeddings.create(input=data_input, model="text-embedding-ada-002")
+     except openai.BadRequestError as err:
+         print(err)
+         return [0, 0, 0]
+     return response.data[0].embedding
+
+
+ if __name__ == "__main__":
+     print("here are all functions that directly call openai.")
+     print("hi, I'm chatGPT, how can I help?")
+     while True:
+         print(gpt_bool(input()))
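A minimal usage sketch for this module, assuming OPENAI_API_KEY (and optionally OPENAI_ORG_ID) is set in the environment and the openai>=1.0 SDK is installed; the prompts and document strings below are made-up examples, not part of the commit:

    import my_1_openai

    # yes/no answer via the "binary" system prompt
    print(my_1_openai.gpt_bool("Is this document type an invoice: 'Pruefprotokoll'?"))

    # free-form classification via the German system prompt in gpt4_new
    print(my_1_openai.gpt4_new("Klassifiziere dieses Dokument: 'Pruefprotokoll Brandschutzklappen'"))

    # embedding; text-embedding-ada-002 returns a 1536-dimensional vector
    vec = my_1_openai.vectorize_data("Brandschutzklappe OG2, Revision B")
    print(len(vec))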
my_1_reader.py ADDED
@@ -0,0 +1,201 @@
+ # NEEDS TO BE CLEANED UP
+
+ import json
+ import os
+ import subprocess
+ import sys
+ import PyPDF2
+ import csv
+ import fitz  # PyMuPDF
+
+
+ def extract_text_from_pdf(pdf_path):
+     """
+     Extracts all text from a PDF file.
+
+     :param pdf_path: Path to the PDF file.
+     :return: Extracted text as a string.
+     """
+     # Open the PDF file
+     doc = fitz.open(pdf_path)
+
+     # Initialize an empty string to hold the text
+     text = ''
+
+     # Iterate through each page in the PDF
+     for page_num in range(len(doc)):
+         # Get a page
+         page = doc.load_page(page_num)
+
+         # Extract text from the page and add it to the result
+         text += page.get_text()
+
+     # Close the document
+     doc.close()
+
+     return text
+
+
+ def read_pdfs_from_folder(folder_path):
+     """
+     Reads all PDF files in the specified folder using PdfReader and extracts their text.
+
+     Parameters:
+     - folder_path: The path to the folder containing PDF files.
+
+     Returns:
+     - A dictionary with file names as keys and their extracted text as values.
+     """
+     pdf_texts = {}
+     for filename in os.listdir(folder_path):
+         if filename.endswith('.pdf'):
+             file_path = os.path.join(folder_path, filename)
+             with open(file_path, 'rb') as pdf_file:
+                 pdf_reader = PyPDF2.PdfReader(pdf_file)
+                 text = ''
+                 for page in pdf_reader.pages:
+                     try:
+                         text += page.extract_text()
+                     except UnicodeDecodeError as e:
+                         print(e)
+                 # transliterate umlauts and replace characters that would break CSV/JSON output
+                 replacements = {
+                     "ä": "ae", "Ä": "Ae",
+                     "ö": "oe", "Ö": "Oe",
+                     "ü": "ue", "Ü": "Ue",
+                     ",": "_", ";": "_", "\\": "_", '"': "_",
+                     "\n": "<newline>",
+                 }
+                 for old, new in replacements.items():
+                     text = text.replace(old, new)
+                 pdf_texts[filename] = text
+     return pdf_texts
+
+
+ def read_csv_lines_as_strings(filename):
+     """
+     Opens a CSV file and returns each line as a string in a list.
+
+     Parameters:
+     - filename: The path to the CSV file.
+
+     Returns:
+     - A list of strings, each representing a line from the CSV file.
+     """
+     lines_as_strings = []
+     with open(filename, newline='') as csvfile:
+         try:
+             reader = csv.reader(csvfile)
+             for row in reader:
+                 # Convert the row (a list of values) back into a comma-separated string
+                 line_as_string = ','.join(row)
+                 lines_as_strings.append(line_as_string)
+         except UnicodeDecodeError as e:
+             print(e)
+     return lines_as_strings
+
+
+ # Function to load data from JSON files
+ def load_data(filename):
+     with open(filename, 'r') as file:
+         try:
+             return json.load(file)
+         except UnicodeDecodeError as err:
+             print(err)
+             return {}
+
+
+ def find_and_open_file(filename, start_directory):
+     """
+     Attempts to open a file with the given filename starting from the specified directory.
+     If the file is not found, searches recursively in all subfolders. Works across macOS, Linux, and Windows.
+     """
+     for root, dirs, files in os.walk(start_directory):
+         if filename in files:
+             filepath = os.path.join(root, filename)
+             print(f"File found: {filepath}")
+             return filepath
+     print(f"File {filename} not found.")
+     return None
+
+
+ def open_file(filepath):
+     """
+     Opens the file with the default application, based on the operating system.
+     """
+     if os.path.exists(filepath):
+         if sys.platform == 'darwin':  # macOS
+             subprocess.call(('open', filepath))
+         elif os.name == 'posix':  # Linux and other Unix-likes
+             subprocess.call(('xdg-open', filepath))
+         elif os.name == 'nt':  # Windows
+             os.startfile(filepath)
+         else:
+             print(f"Cannot open file on this operating system: {filepath}")
+     else:
+         print(f"File does not exist: {filepath}")
+
+
+ def list_folders_files_recursive(path, depth=0):
+     """
+     Recursively lists all folders and files within the specified path, including subfolders.
+
+     Parameters:
+     - path: The directory path to list contents from.
+     - depth: The current depth of recursion (used for indentation in print statements).
+
+     Returns:
+     - None
+     """
+     # Ensure the provided path is a directory
+     if not os.path.isdir(path):
+         print(f"The provided path '{path}' is not a valid directory.")
+         return
+
+     indent = ' ' * depth  # Indentation based on recursion depth
+     folders, files = [], []
+
+     # List all entries in the directory
+     for entry in os.listdir(path):
+         full_path = os.path.join(path, entry)
+         if os.path.isdir(full_path):
+             folders.append(entry)
+             print(f"{indent}Folder: {entry}")
+             # Recursively list subfolders and files
+             list_folders_files_recursive(full_path, depth + 1)
+         elif os.path.isfile(full_path):
+             files.append(entry)
+
+     for f in files:
+         print(f"{indent}File: {f}")
+
+
+ def list_folders_files(path):
+     """
+     Lists all folders and files within the specified path.
+
+     Parameters:
+     - path: The directory path to list contents from.
+
+     Returns:
+     - A tuple of two lists: (folders, files).
+     """
+     folders = []
+     files = []
+
+     # Ensure the provided path is a directory
+     if not os.path.isdir(path):
+         print(f"The provided path '{path}' is not a valid directory.")
+         return folders, files
+
+     # List all entries in the directory
+     for entry in os.listdir(path):
+         full_path = os.path.join(path, entry)
+         if os.path.isdir(full_path):
+             folders.append(entry)
+         elif os.path.isfile(full_path):
+             files.append(entry)
+
+     return folders, files
+
+
+ if __name__ == "__main__":
+     print("here are all functions that read files")
my_1_writer.py ADDED
@@ -0,0 +1,98 @@
+ # NEEDS TO BE CLEANED UP
+
+ import json
+ import pandas as pd
+
+
+ def split_json_file(input_filepath, lines_per_file=50):
+     """
+     Splits a JSON file into multiple files, each containing up to 'lines_per_file' lines.
+
+     :param input_filepath: The path to the input JSON file.
+     :param lines_per_file: The maximum number of lines per output file.
+     """
+     # Counter for file naming
+     file_counter = 1
+     # Open the input file
+     with open(input_filepath, 'r') as input_file:
+         # Read the lines from the input file
+         lines = input_file.readlines()
+         # Iterate through the lines in chunks of 'lines_per_file'
+         for i in range(0, len(lines), lines_per_file):
+             # Determine the output file name (the 'translate_data' folder must already exist)
+             output_filename = f'translate_data/english_{file_counter}.json'
+             # Write the current chunk to the output file
+             with open(output_filename, 'w') as output_file:
+                 # Grab the current chunk of lines
+                 chunk = lines[i:i+lines_per_file]
+                 # Write each line to the output file
+                 for line in chunk:
+                     output_file.write(line)
+             print(f'Created {output_filename}')
+             # Increment the file counter
+             file_counter += 1
+
+
+ def merge_and_save(list1, list2, dict1, dict2, filename='output.csv'):
+     """
+     Merges two lists and two dictionaries into a pandas DataFrame according to the specified structure:
+     headers: ['list1', 'list2', 'keys dict1', 'vals dict1', 'keys dict2', 'vals dict2']
+     and saves it as a CSV file.
+
+     Parameters:
+     - list1 (list): First list to merge, contributing to column 'list1'.
+     - list2 (list): Second list to merge, contributing to column 'list2'.
+     - dict1 (dict): First dictionary to merge, keys and values added as separate columns.
+     - dict2 (dict): Second dictionary to merge, keys and values added as separate columns.
+     - filename (str): Filename for the saved CSV file.
+     """
+     # Combining all elements into a structured list of dictionaries for DataFrame construction
+     data = []
+     dict1_items = list(dict1.items())
+     dict2_items = list(dict2.items())
+     for i in range(len(list1)):
+         row = {
+             'list1': list1[i],
+             'list2': list2[i],
+             'keys dict1': dict1_items[i][0],
+             'vals dict1': dict1_items[i][1],
+             'keys dict2': dict2_items[i][0],
+             'vals dict2': dict2_items[i][1]
+         }
+         data.append(row)
+
+     # Creating the DataFrame
+     df = pd.DataFrame(data)
+
+     # Saving the DataFrame to a CSV file
+     df.to_csv(filename, index=False)
+     print(f"DataFrame saved as '{filename}' in the current directory.")
+
+
+ # new line for every entry
+ def safe_my_dict_as_json(file_name, my_dict):
+     # note: "safe" (sic) is kept as the function name because the other modules import it by this name
+     print(my_dict)
+     # Unwrap a list to its first element before counting, so the item count below is correct
+     if type(my_dict) == list:
+         my_dict = my_dict[0]
+     # Get total number of items to control comma insertion
+     total_items = len(my_dict)
+     # Open a file for writing
+     with open(file_name, 'w') as f:
+         # Write the opening brace of the JSON object
+         f.write('{\n')
+         # Iterate over items, keeping track of the current item index
+         for i, (key, value) in enumerate(my_dict.items()):
+             # Serialize the key with JSON to handle special characters and ensure proper quoting
+             json_key = json.dumps(key)
+             # Convert the value to a JSON-formatted string (without indentation)
+             json_value = json.dumps(value)
+             # Determine if a comma is needed (for all but the last item)
+             comma = ',' if i < total_items - 1 else ''
+             # Write the formatted string to the file
+             f.write(f"    {json_key}: {json_value}{comma}\n")
+         # Write the closing brace of the JSON object
+         f.write('}\n')
+
+
+ if __name__ == "__main__":
+     print("here are all functions that write to the Datasets")
my_2_embedder.py ADDED
@@ -0,0 +1,169 @@
+ import my_1_reader
+ import my_1_writer
+ import my_1_openai
+ import os
+ import openai
+ import pdf2image
+ from pdf2image import convert_from_path
+ from PIL import Image
+ import csv
+ import numpy as np
+ import pdfminer
+
+
+ # Assuming your my_1_openai's vectorize functions work as described
+ def vectorize_data(data):
+     # Replace this with your actual logic to vectorize text data
+     return np.random.rand(100).tolist()  # Example vector
+
+
+ def vectorize_image(data):
+     # Replace this with your actual logic to vectorize image data
+     return np.random.rand(100).tolist()  # Example vector
+
+
+ def vectorize_this_pdf_with_metadata(pdf_path, output_path, metadata_filename="DS_U3/U3_Metadaten.csv"):
+     tensor_description = {
+         "my_id": 89,  # Example ID, ideally this should be dynamically generated
+         "og_name": pdf_path,
+         "metadata": {},
+         "vec_content_text": [],
+         "vec_content_img": []
+     }
+
+     # Read metadata from CSV and match by 'og_name' (pdf_path)
+     # possible encodings: 'latin1', 'ISO-8859-1', or 'cp1252'
+     with open(metadata_filename, mode='r', encoding='utf-8') as csvfile:
+         csv_reader = csv.DictReader(csvfile)
+         for row in csv_reader:
+             if row["Name"] == os.path.basename(pdf_path):  # Assuming 'Name' is a column in your CSV
+                 tensor_description['metadata'] = row
+                 break
+
+     # get text content
+     text = my_1_reader.extract_text_from_pdf(pdf_path)
+
+     # Vectorize extracted text
+     if text:
+         tensor_description['vec_content_text'].append(vectorize_data(data=text))
+
+     # Convert PDF pages to images using pdf2image
+     images = convert_from_path(pdf_path)
+     for img in images:
+         # Assume vectorize_image expects a PIL image; pdf2image.convert_from_path already returns PIL images
+         img_vector = vectorize_image(data=img)
+         tensor_description['vec_content_img'].append(img_vector)
+
+     # Here, instead of saving the tensor, we'll simply print it as an example
+     print(tensor_description)
+
+     return tensor_description
+
+
+ def vectorize_this_pdf_with_metadata_old(pdf_path, output_path, metadata_filename="DS_U3/U3_Metadaten.csv"):
+     # get PDF content, split into chunks
+
+     tensor_description = {  # sample
+         "my_id": 89,  # count how often the func has been called
+         "og_name": pdf_path,
+         "metadata": {"a": 1, "b": 2, "c": 3},  # get from metadata_filename: the full row whose filename in column A matches
+         "vec_content_text": [[0.03874, 0.03947, -0.0875], [-0.03234, 0.03437, -0.011234]],  # vectorize all chunks of all the text in the PDF
+         # call my_1_openai.vectorize_data(data="string"); this function returns the vector from Ada002 as a list
+         "vec_content_img": [[0.01234, 0.09875, -0.0542], [-0.02456, 0.03537, -0.016634]]
+         # for the images, turn every PDF page into an image using pdf2image
+         # call my_1_openai.vectorize_image(data=PIL_OBJ); this function should return the vector of the image, comparable to text. write this function as well.
+     }
+
+     tensor = []  # make tensor from tensor_description
+     return tensor
+
+
+ def vectorize_pdfs(pdf_dict):
+     """
+     Vectorize a pdf using openai API
+
+     Parameters:
+     - pdf_dict: dictionary containing PDF texts.
+
+     Returns:
+     - dictionary containing vectors
+     """
+     vec_dataset = {}
+     for key in pdf_dict.keys():
+         try:
+             vector = my_1_openai.vectorize_data(pdf_dict[key])
+         except openai.BadRequestError as err:
+             print(err)
+             vector = [0, 0, 0]
+         vec_dataset[key] = str(vector)
+     return vec_dataset
+
+
+ def vectorize_csv(csv_table, safe=False):
+     folder_name = ""
+     if safe:
+         folder_name = f"{csv_table}_vectorised/"
+         if not os.path.exists(folder_name):
+             os.makedirs(folder_name)
+     nb = 1
+     vec_dataset = []
+     for data_item in csv_table:
+         vector = my_1_openai.vectorize_data(data_item)
+         if safe:
+             # append so earlier lines are not overwritten on every iteration
+             with open(f"{folder_name}{csv_table}_vec.txt", "a") as f:
+                 f.write(str(vector) + "\n")
+             print("csv_line" + str(nb))
+             nb += 1
+         vec_dataset.append(str(vector))
+     return vec_dataset
+
+
+ def create_df(ds):
+     # my_df = {"name": [], "metadata": [], "body_text": []}
+     my_df, my_new_df = {}, {}
+     my_df["name"] = [filename for filename in os.listdir(ds) if filename.endswith('.pdf')]
+     my_df["metadata"] = my_1_reader.read_csv_lines_as_strings(ds + "_metadata.csv")[1:11]
+     my_df["text"] = list(my_1_reader.read_pdfs_from_folder(ds).values())
+     for e in my_df:
+         my_new_df[f"{e}_vec"] = [my_1_openai.vectorize_data(item) for item in my_df[e]]
+     for e in my_new_df:
+         my_df[str(e)] = my_new_df[e]
+     for e in my_df:
+         print(f"{e} {my_df[e][2]}")
+
+
+ def create_vec_dataset(folder):
+     my_pdfs = my_1_reader.read_pdfs_from_folder(f"{folder}/PDF")
+     vectorize_then_safe_data(f"{folder}/vectors/names.json", my_pdfs.keys())
+     vectorize_then_safe_data(f"{folder}/vectors/texts.json", my_pdfs.values())
+
+
+ # function to vectorize data=[], then saves it as json.
+ def vectorize_then_safe_data(file_name, data):
+     my_vec_words = []
+     for entry in data:
+         my_vec_words.append(my_1_openai.vectorize_data(entry))
+     my_dict = dict(zip(data, my_vec_words))
+     my_1_writer.safe_my_dict_as_json(file_name, my_dict)
+     print("vectorised data saved")
+
+
+ def main():
+     # Example call to the function
+     pdf_path = 'DS_U3/Dokumente/E - Elektroanlagen/ISB-020-U3-W-E-01-B07005-001-040.pdf'
+     output_path = 'DS_U3/Dokumente_vec'
+     metadata_filename = 'DS_U3/U3_Metadaten.csv'
+     vectorize_this_pdf_with_metadata(pdf_path, output_path, metadata_filename)
+
+
+ if __name__ == "__main__":
+     print("this file contains embedding functions")
+     vec1 = vectorize_data("this is the test string")
+     vec2 = vectorize_data("this is the test string")
+     if vec1 == vec2:
+         print("same")
my_2_sim_search.py ADDED
@@ -0,0 +1,109 @@
+ import my_1_openai
+ import my_1_writer
+ import json
+ import numpy as np
+
+
+ # sim search with dot_product and lin_distance
+ # the newly vectorized TERM will be added to the database
+ def sim_search(database, term, add_to_db=True, debug=False):
+     if type(term) == str:
+         print("str")
+         vector1 = my_1_openai.vectorize_data(term)
+     elif type(term) == list:
+         print("list")
+         vector1 = term
+     else:
+         print("invalid search_term/search_vector format")
+         return
+     with open(database, "r") as f:
+         table = json.load(f)
+     sim_search_dict = {}
+     for key in table.keys():
+         vector2 = table[key]
+         if debug:
+             print("")
+             print(f"{vector1}")
+             print(f"{vector2}")
+             print(f"doing dot product for {key} and {term}")
+         dp = np.dot(vector1, vector2)
+         distance = np.linalg.norm(np.array(vector1) - np.array(vector2))
+         if debug:
+             print(f"the dp is {dp}")
+             print(f"the distance is {distance}")
+             print("")
+             print("")
+             print("")
+         sim_search_dict[key] = dp * distance
+
+     # sort with the biggest similarity first
+     sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))
+
+     if debug:
+         for key, value in list(sorted_table.items())[:5]:
+             print(f"{key}: {value}")
+     if add_to_db:
+         if term in table.keys():
+             print("the search term is in the database!")
+         # add the newly vectorized term to the words, if not already in the vector table
+         else:
+             if database != "session/my_words_vec_table.json":
+                 database = "session/my_vecs.json"
+             # table = load_df(database)  # ??
+             table[str(term)] = vector1
+             my_1_writer.safe_my_dict_as_json(database, table)
+     # first_key, first_value = list(sorted_table.items())[0]
+     print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
+     return sorted_table
+
+
+ def dot_p_to_1(database, vector1=0, analysis_filename=0):
+     with open(database, "r") as f:
+         table = json.load(f)
+     dot_product_to1 = {}
+
+     if vector1 == 0:
+         vector1 = [0.025515518153991442 for _ in range(1536)]
+     elif vector1 == 1:
+         vector1 = table[str(list(table.keys())[0])]
+
+     for key in table.keys():
+         # cast to float so the result is JSON-serializable
+         dot_product_to1[key] = float(np.dot(vector1, table[key]))
+     my_1_writer.safe_my_dict_as_json(analysis_filename, dot_product_to1)
+     print("dot p to 1 saved")
+
+
+ def lin_dist(database, vector1=0, analysis_filename=0):
+     with open(database, "r") as f:
+         table = json.load(f)
+     lin_dist_to_1 = {}
+
+     if vector1 == 0:
+         vector1 = [0.025515518153991442 for _ in range(1536)]
+     elif vector1 == 1:
+         vector1 = table[str(list(table.keys())[0])]
+
+     for key in table.keys():
+         lin_dist_to_1[key] = float(np.linalg.norm(np.array(vector1) - np.array(table[key])))
+
+     my_1_writer.safe_my_dict_as_json(analysis_filename, lin_dist_to_1)
+     print("lin dist to 1 saved")
+
+
+ def manhattan_dist(database, vector1=0, analysis_filename=0):
+     with open(database, "r") as f:
+         table = json.load(f)
+     manhattan_dist_to_1 = {}
+
+     if vector1 == 0:
+         vector1 = [0.025515518153991442 for _ in range(1536)]
+     elif vector1 == 1:
+         vector1 = table[str(list(table.keys())[0])]
+
+     for key in table.keys():
+         # Manhattan distance is the sum of absolute differences
+         manhattan_dist_to_1[key] = float(np.sum(np.abs(np.array(vector1) - np.array(table[key]))))
+
+     my_1_writer.safe_my_dict_as_json(analysis_filename, manhattan_dist_to_1)
+     print("manhattan dist to 1 saved")