elia-waefler committed · Commit 8b9aa2c · Parent(s): 661b653
Commit message: init files, idea

Files changed:
- .idea/.name +1 -0
- my_1_openai.py +59 -0
- my_1_reader.py +201 -0
- my_1_writer.py +98 -0
- my_2_embedder.py +169 -0
- my_2_sim_search.py +109 -0
.idea/.name
ADDED
@@ -0,0 +1 @@
+app.py
my_1_openai.py
ADDED
@@ -0,0 +1,59 @@
+import os
+import openai
+from openai import OpenAI
+
+# one shared client, configured from the environment (openai>=1.0 interface)
+client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"),
+                organization=os.environ.get("OPENAI_ORG_ID"))
+models = {
+    "assistant": "You are a helpful assistant.",
+    "binary": "you are a machine that converts questions or prompts to binary outputs. "
+              "you can only answer 'yes' or 'no'. if uncertain, default to 'no'."
+}
+
+
+def gpt4_new(prompt_text):
+    response = client.chat.completions.create(
+        model="gpt-4",
+        messages=[{"role": "system",
+                   # German system prompt: "You are a machine that classifies documents."
+                   "content": "Du bist eine Maschine, die Dokumente klassifiziert."},
+                  {"role": "user", "content": prompt_text}])
+    return response.choices[0].message.content
+
+
+def gpt4(prompt, model=models["assistant"]):
+    response = client.chat.completions.create(
+        model="gpt-4",
+        messages=[
+            {"role": "system", "content": model},
+            {"role": "user", "content": prompt}
+        ]
+    )
+    return response.choices[0].message.content
+
+
+def gpt_bool(prompt):
+    """
+    :param prompt: the text prompt
+    :return: True or False
+    """
+    true_values = ["yes", "Yes", "Y", "y", "yes.", "Yes.", "YES"]
+    return gpt4(prompt, model=models["binary"]) in true_values
+
+
+def vectorize_data(data_input):
+    try:
+        response = client.embeddings.create(input=data_input, model="text-embedding-ada-002")
+    except openai.BadRequestError as err:
+        print(err)
+        return [0, 0, 0]
+    return response.data[0].embedding
+
+
+if __name__ == "__main__":
+    print("here are all functions that directly call openai.")
+    print("hi, im chatGPT how can I help? ")
+    while True:
+        print(gpt_bool(input()))
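
A minimal usage sketch for the helpers above (illustrative, not part of the commit); it assumes OPENAI_API_KEY and OPENAI_ORG_ID are set in the environment:

    from my_1_openai import gpt_bool, vectorize_data

    # yes/no classification through the "binary" system prompt
    print(gpt_bool("Is this document an electrical installation plan?"))

    # 1536-dimensional Ada-002 embedding; falls back to [0, 0, 0] on an API error
    vec = vectorize_data("this is the test string")
    print(len(vec))
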
my_1_reader.py
ADDED
@@ -0,0 +1,201 @@
+# NEEDS CLEANUP (original note: "MUSS AUFGERÄUMT WERDEN")
+
+import json
+import os
+import subprocess
+import PyPDF2
+import csv
+import fitz  # PyMuPDF
+
+
+def extract_text_from_pdf(pdf_path):
+    """
+    Extracts all text from a PDF file.
+
+    :param pdf_path: Path to the PDF file.
+    :return: Extracted text as a string.
+    """
+    # Open the PDF file
+    doc = fitz.open(pdf_path)
+
+    # Initialize an empty string to hold the text
+    text = ''
+
+    # Iterate through each page in the PDF
+    for page_num in range(len(doc)):
+        # Get a page
+        page = doc.load_page(page_num)
+
+        # Extract text from the page and add it to the result
+        text += page.get_text()
+
+    # Close the document
+    doc.close()
+
+    return text
+
+
+def read_pdfs_from_folder(folder_path):
+    """
+    Reads all PDF files in the specified folder using PdfReader and extracts their text.
+
+    Parameters:
+    - folder_path: The path to the folder containing PDF files.
+
+    Returns:
+    - A dictionary with file names as keys and their extracted text as values.
+    """
+    pdf_texts = {}
+    for filename in os.listdir(folder_path):
+        if filename.endswith('.pdf'):
+            file_path = os.path.join(folder_path, filename)
+            with open(file_path, 'rb') as pdf_file:
+                pdf_reader = PyPDF2.PdfReader(pdf_file)
+                text = ''
+                for page in pdf_reader.pages:
+                    try:
+                        text += page.extract_text()
+                    except UnicodeDecodeError as e:
+                        print(e)
+                # Normalize umlauts and escape characters that would break CSV/JSON handling
+                for old, new in (("ä", "ae"), ("Ä", "ae"), ("ö", "oe"), ("Ö", "oe"),
+                                 ("ü", "ue"), ("Ü", "ue"), (",", "_"), (";", "_"),
+                                 ("\\", "_"), ('"', "_"), ("\n", "<newline>")):
+                    text = text.replace(old, new)
+                pdf_texts[filename] = text
+    return pdf_texts
+
+
+def read_csv_lines_as_strings(filename):
+    """
+    Opens a CSV file and returns each line as a string in a list.
+
+    Parameters:
+    - filename: The path to the CSV file.
+
+    Returns:
+    - A list of strings, each representing a line from the CSV file.
+    """
+    lines_as_strings = []
+    with open(filename, newline='') as csvfile:
+        try:
+            reader = csv.reader(csvfile)
+            for row in reader:
+                # Convert the row (a list of values) back into a comma-separated string
+                line_as_string = ','.join(row)
+                lines_as_strings.append(line_as_string)
+        except UnicodeDecodeError as e:
+            print(e)
+    return lines_as_strings
+
+
+# Function to load data from JSON files
+def load_data(filename):
+    with open(filename, 'r') as file:
+        try:
+            return json.load(file)
+        except UnicodeDecodeError as err:
+            print(err)
+            return {}
+
+
+def find_and_open_file(filename, start_directory):
+    """
+    Attempts to open a file with the given filename starting from the specified directory.
+    If the file is not found, searches recursively in all subfolders. Works across macOS, Linux, and Windows.
+    """
+    for root, dirs, files in os.walk(start_directory):
+        if filename in files:
+            filepath = os.path.join(root, filename)
+            print(f"File found: {filepath}")
+            return filepath
+    print(f"File {filename} not found.")
+    return None
+
+
+def open_file(filepath):
+    """
+    Opens the file with the default application, based on the operating system.
+    """
+    if os.path.exists(filepath):
+        if os.name == 'posix':  # Linux, macOS, etc.
+            subprocess.call(('open', filepath))
+        elif os.name == 'nt':  # Windows
+            os.startfile(filepath)
+        else:
+            print(f"Cannot open file on this operating system: {filepath}")
+    else:
+        print(f"File does not exist: {filepath}")
+
+
+def list_folders_files_recursive(path, depth=0):
+    """
+    Recursively lists all folders and files within the specified path, including subfolders.
+
+    Parameters:
+    - path: The directory path to list contents from.
+    - depth: The current depth of recursion (used for indentation in print statements).
+
+    Returns:
+    - None
+    """
+    # Ensure the provided path is a directory
+    if not os.path.isdir(path):
+        print(f"The provided path '{path}' is not a valid directory.")
+        return
+
+    indent = ' ' * depth  # Indentation based on recursion depth
+    folders, files = [], []
+
+    # List all entries in the directory
+    for entry in os.listdir(path):
+        full_path = os.path.join(path, entry)
+        if os.path.isdir(full_path):
+            folders.append(entry)
+            print(f"{indent}Folder: {entry}")
+            # Recursively list subfolders and files
+            list_folders_files_recursive(full_path, depth + 1)
+        elif os.path.isfile(full_path):
+            files.append(entry)
+
+    for f in files:
+        print(f"{indent}File: {f}")
+
+
+def list_folders_files(path):
+    """
+    Lists all folders and files within the specified path.
+
+    Parameters:
+    - path: The directory path to list contents from.
+
+    Returns:
+    - A tuple of two lists: (folders, files).
+    """
+    folders = []
+    files = []
+
+    # Ensure the provided path is a directory
+    if not os.path.isdir(path):
+        print(f"The provided path '{path}' is not a valid directory.")
+        return folders, files
+
+    # List all entries in the directory
+    for entry in os.listdir(path):
+        full_path = os.path.join(path, entry)
+        if os.path.isdir(full_path):
+            folders.append(entry)
+        elif os.path.isfile(full_path):
+            files.append(entry)
+
+    return folders, files
+
+
+if __name__ == "__main__":
+    print("here are all functions that read files")
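
A short, hypothetical driver for the reader helpers above, assuming a local folder docs/ containing PDF files (the folder and file names are illustrative):

    import my_1_reader

    # one cleaned-up string per PDF, keyed by file name
    texts = my_1_reader.read_pdfs_from_folder("docs/")
    for name, body in texts.items():
        print(name, len(body))

    # find a file anywhere below the working directory and open it with the default app
    path = my_1_reader.find_and_open_file("report.pdf", ".")
    if path:
        my_1_reader.open_file(path)
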
my_1_writer.py
ADDED
@@ -0,0 +1,98 @@
+# NEEDS CLEANUP (original note: "MUSS AUFGERÄUMT WERDEN")
+
+import json
+import pandas as pd
+
+
+def split_json_file(input_filepath, lines_per_file=50):
+    """
+    Splits a JSON file into multiple files, each containing up to 'lines_per_file' lines.
+
+    :param input_filepath: The path to the input JSON file.
+    :param lines_per_file: The maximum number of lines per output file.
+    """
+    # Counter for file naming
+    file_counter = 1
+    # Open the input file
+    with open(input_filepath, 'r') as input_file:
+        # Read the lines from the input file
+        lines = input_file.readlines()
+        # Iterate through the lines in chunks of 'lines_per_file'
+        for i in range(0, len(lines), lines_per_file):
+            # Determine the output file name
+            output_filename = f'translate_data/english_{file_counter}.json'
+            # Write the current chunk to the output file
+            with open(output_filename, 'w') as output_file:
+                # Grab the current chunk of lines
+                chunk = lines[i:i+lines_per_file]
+                # Write each line to the output file
+                for line in chunk:
+                    output_file.write(line)
+            print(f'Created {output_filename}')
+            # Increment the file counter
+            file_counter += 1
+
+
+def merge_and_save(list1, list2, dict1, dict2, filename='output.csv'):
+    """
+    Merges two lists and two dictionaries into a pandas DataFrame according to the specified structure:
+    headers: ['list1', 'list2', 'keys dict1', 'vals dict1', 'keys dict2', 'vals dict2']
+    and saves it as a CSV file.
+
+    Parameters:
+    - list1 (list): First list to merge, contributing to column 'list1'.
+    - list2 (list): Second list to merge, contributing to column 'list2'.
+    - dict1 (dict): First dictionary to merge, keys and values added as separate columns.
+    - dict2 (dict): Second dictionary to merge, keys and values added as separate columns.
+    - filename (str): Filename for the saved CSV file.
+    """
+    # Combining all elements into a structured list of dictionaries for DataFrame construction
+    data = []
+    dict1_items = list(dict1.items())
+    dict2_items = list(dict2.items())
+    for i in range(len(list1)):
+        row = {
+            'list1': list1[i],
+            'list2': list2[i],
+            'keys dict1': dict1_items[i][0],
+            'vals dict1': dict1_items[i][1],
+            'keys dict2': dict2_items[i][0],
+            'vals dict2': dict2_items[i][1]
+        }
+        data.append(row)
+
+    # Creating the DataFrame
+    df = pd.DataFrame(data)
+
+    # Saving the DataFrame to a CSV file
+    df.to_csv(filename, index=False)
+    print(f"DataFrame saved as '{filename}' in the current directory.")
+
+
+# new line for every entry
+def safe_my_dict_as_json(file_name, my_dict):
+    print(my_dict)
+    # If a list was passed, use its first element (expected to be a dict)
+    if type(my_dict) == list:
+        my_dict = my_dict[0]
+    # Open a file for writing
+    with open(file_name, 'w') as f:
+        # Write the opening brace of the JSON object
+        f.write('{\n')
+        # Get total number of items to control comma insertion
+        total_items = len(my_dict)
+        # Iterate over items, keeping track of the current item index
+        for i, (key, value) in enumerate(my_dict.items()):
+            # Serialize the key with JSON to handle special characters and ensure proper quoting
+            json_key = json.dumps(key)
+            # Convert the value to a JSON-formatted string (without indentation)
+            json_value = json.dumps(value)
+            # Determine if a comma is needed (for all but the last item)
+            comma = ',' if i < total_items - 1 else ''
+            # Write the formatted string to the file
+            f.write(f" {json_key}: {json_value}{comma}\n")
+        # Write the closing brace of the JSON object
+        f.write('}\n')
+
+
+if __name__ == "__main__":
+    print("here are all functions that write to the Datasets")
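
safe_my_dict_as_json writes the JSON by hand so that every key keeps its whole vector on a single line. For comparison, json.dump with an indent also emits one entry per line, but it additionally spreads the nested vector lists over many lines; a small sketch of that trade-off (file name illustrative):

    import json

    my_dict = {"word_a": [0.1, 0.2], "word_b": [0.3, 0.4]}
    with open("output.json", "w") as f:
        json.dump(my_dict, f, indent=2)  # valid JSON, but each list element gets its own line
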
my_2_embedder.py
ADDED
@@ -0,0 +1,169 @@
+import my_1_reader
+import my_1_writer
+import my_1_openai
+import os
+import csv
+import openai
+import numpy as np
+import pdfminer
+import pdf2image
+from pdf2image import convert_from_path
+from PIL import Image
+
+
+# Assuming your my_1_openai's vectorize functions work as described
+def vectorize_data(data):
+    # Replace this with your actual logic to vectorize text data
+    return np.random.rand(100).tolist()  # Example vector
+
+
+def vectorize_image(data):
+    # Replace this with your actual logic to vectorize image data
+    return np.random.rand(100).tolist()  # Example vector
+
+
+def vectorize_this_pdf_with_metadata(pdf_path, output_path, metadata_filename="DS_U3/U3_Metadaten.csv"):
+    tensor_description = {
+        "my_id": 89,  # Example ID, ideally this should be dynamically generated
+        "og_name": pdf_path,
+        "metadata": {},
+        "vec_content_text": [],
+        "vec_content_img": []
+    }
+
+    # Read metadata from CSV and match by 'og_name' (pdf_path)
+    # other encodings that may be needed: 'latin1', 'ISO-8859-1', or 'cp1252'
+    with open(metadata_filename, mode='r', encoding='utf-8') as csvfile:
+        csv_reader = csv.DictReader(csvfile)
+        for row in csv_reader:
+            if row["Name"] == os.path.basename(pdf_path):  # Assuming 'Name' is a column in your CSV
+                tensor_description['metadata'] = row
+                break
+
+    # get text content
+    text = my_1_reader.extract_text_from_pdf(pdf_path)
+
+    # Vectorize extracted text
+    if text:
+        tensor_description['vec_content_text'].append(vectorize_data(data=text))
+
+    # Convert PDF pages to images using pdf2image
+    images = convert_from_path(pdf_path)
+    for img in images:
+        # vectorize_image expects a PIL image; pdf2image.convert_from_path already returns PIL images
+        img_vector = vectorize_image(data=img)
+        tensor_description['vec_content_img'].append(img_vector)
+
+    # Here, instead of saving the tensor, we simply print it as an example
+    print(tensor_description)
+
+    return tensor_description
+
+
+def vectorize_this_pdf_with_metadata_old(pdf_path, output_path, metadata_filename="DS_U3/U3_Metadaten.csv"):
+    # get PDF content, split into chunks
+
+    tensor_description = {  # sample
+        "my_id": 89,  # count how often the function has been called
+        "og_name": pdf_path,
+        "metadata": {"a": 1, "b": 2, "c": 3},  # from metadata_filename: the full row whose filename in column A matches
+        "vec_content_text": [[0.03874, 0.03947, -0.0875], [-0.03234, 0.03437, -0.011234]],  # vectorize all text chunks of the PDF
+        # call my_1_openai.vectorize_data(data="string"); it returns the Ada-002 vector as a list
+        "vec_content_img": [[0.01234, 0.09875, -0.0542], [-0.02456, 0.03537, -0.016634]]
+        # for the images, turn every PDF page into an image using pdf2image,
+        # then call my_1_openai.vectorize_image(data=PIL_OBJ); this function should return a vector
+        # of the image that is comparable to the text vectors. write this function as well.
+    }
+
+    tensor = []  # make tensor from tensor_description
+    return tensor
+
+
+def vectorize_pdfs(pdf_dict):
+    """
+    Vectorize a pdf using openai API
+
+    Parameters:
+    - dataset: dictionary containing PDF files.
+
+    Returns:
+    - dictionary containing vectors
+    """
+    vec_dataset = {}
+    for key in pdf_dict.keys():
+        try:
+            vector = my_1_openai.vectorize_data(pdf_dict[key])
+        except openai.BadRequestError as err:
+            print(err)
+            vector = [0, 0, 0]
+        vec_dataset[key] = str(vector)
+    return vec_dataset
+
+
+def vectorize_csv(csv_table, safe=False):
+    folder_name = ""
+    if safe:
+        folder_name = f"{csv_table}_vectorised/"
+        if not os.path.exists(folder_name):
+            os.makedirs(folder_name)
+    nb = 1
+    vec_dataset = []
+    for data_item in csv_table:
+        vector = my_1_openai.vectorize_data(data_item)
+        if safe:
+            # append so earlier vectors are not overwritten
+            with open(f"{folder_name}{csv_table}_vec.txt", "a") as f:
+                f.write(str(vector) + "\n")
+            print("csv_line" + str(nb))
+            nb += 1
+        vec_dataset.append(str(vector))
+    return vec_dataset
+
+
+def create_df(ds):
+    # my_df = {"name": [], "metadata": [], "body_text": []}
+    my_df, my_new_df = {}, {}
+    my_df["name"] = [filename for filename in os.listdir(ds) if filename.endswith('.pdf')]
+    my_df["metadata"] = my_1_reader.read_csv_lines_as_strings(ds + "_metadata.csv")[1:11]
+    my_df["text"] = list(my_1_reader.read_pdfs_from_folder(ds).values())
+    for e in my_df:
+        my_new_df[f"{e}_vec"] = [my_1_openai.vectorize_data(item) for item in my_df[e]]
+    for e in my_new_df:
+        my_df[str(e)] = my_new_df[e]
+    for e in my_df:
+        print(f"{e} {my_df[e][2]}")
+
+
+def create_vec_dataset(folder):
+    my_pdfs = my_1_reader.read_pdfs_from_folder(f"{folder}/PDF")
+    vectorize_then_safe_data(f"{folder}/vectors/names.json", my_pdfs.keys())
+    vectorize_then_safe_data(f"{folder}/vectors/texts.json", my_pdfs.values())
+
+
+# function to vectorize data=[], then save as json.
+def vectorize_then_safe_data(file_name, data):
+    my_vec_words = []
+    for entry in data:
+        my_vec_words.append(my_1_openai.vectorize_data(entry))
+    my_dict = dict(zip(data, my_vec_words))
+    my_1_writer.safe_my_dict_as_json(file_name, my_dict)
+    print("vectorised data saved")
+
+
+def main():
+    # Example call to the function
+    pdf_path = 'DS_U3/Dokumente/E - Elektroanlagen/ISB-020-U3-W-E-01-B07005-001-040.pdf'
+    output_path = 'DS_U3/Dokumente_vec'
+    metadata_filename = 'DS_U3/U3_Metadaten.csv'
+    vectorize_this_pdf_with_metadata(pdf_path, output_path, metadata_filename)
+
+
+if __name__ == "__main__":
+    print("this file contains embedding functions")
+    vec1 = vectorize_data("this is the test string")
+    vec2 = vectorize_data("this is the test string")
+    if vec1 == vec2:
+        print("same")
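
Note that the module-level vectorize_data and vectorize_image above are random-vector placeholders. A hypothetical way to wire the text path to the real Ada-002 helper before running the pipeline (paths taken from main() and assumed to exist locally):

    import my_1_openai
    import my_2_embedder

    # swap the random placeholder for the real embedding call
    my_2_embedder.vectorize_data = my_1_openai.vectorize_data

    tensor = my_2_embedder.vectorize_this_pdf_with_metadata(
        pdf_path="DS_U3/Dokumente/E - Elektroanlagen/ISB-020-U3-W-E-01-B07005-001-040.pdf",
        output_path="DS_U3/Dokumente_vec",
        metadata_filename="DS_U3/U3_Metadaten.csv",
    )
    print(len(tensor["vec_content_text"]), len(tensor["vec_content_img"]))
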
my_2_sim_search.py
ADDED
@@ -0,0 +1,109 @@
+import my_1_openai
+import my_1_writer
+import json
+import numpy as np
+
+
+# sim search with dot_product and lin_distance
+# the newly vectorized TERM will be added to the database
+def sim_search(database, term, add_to_db=True, debug=False):
+    if type(term) == str:
+        print("str")
+        vector1 = my_1_openai.vectorize_data(term)
+    elif type(term) == list:
+        print("list")
+        vector1 = term
+    else:
+        print("invalid search_term/search_vector format")
+        return
+    with open(database, "r") as f:
+        table = json.load(f)
+    sim_search_dict = {}
+    for key in table.keys():
+        vector2 = table[key]
+        if debug:
+            print("")
+            print(f"{vector1}")
+            print(f"{vector2}")
+            print(f"doing dot product for {key} and {term}")
+        dp = np.dot(vector1, vector2)
+        distance = np.linalg.norm(np.array(vector1) - np.array(vector2))
+        if debug:
+            print(f"the dp is {dp}")
+            print(f"the distance is {distance}")
+            print("")
+            print("")
+            print("")
+        sim_search_dict[key] = dp * distance
+
+    # sort with the biggest similarity first
+    sorted_table = dict(sorted(sim_search_dict.items(), key=lambda item: item[1], reverse=True))
+
+    if debug:
+        for key, value in list(sorted_table.items())[:5]:
+            print(f"{key}: {value}")
+    if add_to_db:
+
+        if term in table.keys():
+            print("the search term is in the database!")
+        # add the newly vectorized term to the words, if not already in the vector table
+        else:
+            if database != "session/my_words_vec_table.json":
+                database = "session/my_vecs.json"
+            # table = load_df(database)  # ??
+            table[str(term)] = vector1
+            my_1_writer.safe_my_dict_as_json(database, table)
+    # first_key, first_value = list(sorted_table.items())[0]
+    print(f"the closest word to your input is: {list(sorted_table.keys())[0]}")
+    return sorted_table
+
+
+def dot_p_to_1(database, vector1=0, analysis_filename=0):
+
+    with open(database, "r") as f:
+        table = json.load(f)
+    dot_product_to1 = {}
+
+    if vector1 == 0:
+        vector1 = [0.025515518153991442 for _ in range(1536)]
+    elif vector1 == 1:
+        vector1 = table[str(list(table.keys())[0])]
+
+    for key in table.keys():
+        # cast to float so the result stays JSON-serializable
+        dot_product_to1[key] = float(np.dot(vector1, table[key]))
+    my_1_writer.safe_my_dict_as_json(analysis_filename, dot_product_to1)
+    print("dot p to 1 saved")
+
+
+def lin_dist(database, vector1=0, analysis_filename=0):
+    with open(database, "r") as f:
+        table = json.load(f)
+    lin_dist_to_1 = {}
+
+    if vector1 == 0:
+        vector1 = [0.025515518153991442 for _ in range(1536)]
+    elif vector1 == 1:
+        vector1 = table[str(list(table.keys())[0])]
+
+    for key in table.keys():
+        lin_dist_to_1[key] = float(np.linalg.norm(np.array(vector1) - np.array(table[key])))
+
+    my_1_writer.safe_my_dict_as_json(analysis_filename, lin_dist_to_1)
+    print("lin dist to 1 saved")
+
+
+def manhattan_dist(database, vector1=0, analysis_filename=0):
+    with open(database, "r") as f:
+        table = json.load(f)
+    manhattan_dist_to_1 = {}
+
+    if vector1 == 0:
+        vector1 = [0.025515518153991442 for _ in range(1536)]
+    elif vector1 == 1:
+        vector1 = table[str(list(table.keys())[0])]
+
+    for key in table.keys():
+        # Manhattan distance is the sum of absolute coordinate differences
+        manhattan_dist_to_1[key] = float(np.sum(np.abs(np.array(vector1) - np.array(table[key]))))
+
+    my_1_writer.safe_my_dict_as_json(analysis_filename, manhattan_dist_to_1)
+    print("manhattan dist to 1 saved")
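
The ranking in sim_search scores each entry with the dot product multiplied by the Euclidean distance. For Ada-002 embeddings, cosine similarity is the more usual score; a sketch of what that swap could look like (not part of the commit):

    import numpy as np

    def cosine_similarity(v1, v2):
        """Cosine similarity of two embedding vectors; 1.0 means identical direction."""
        a, b = np.array(v1, dtype=float), np.array(v2, dtype=float)
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

    # inside sim_search, the scoring line would then read:
    # sim_search_dict[key] = cosine_similarity(vector1, vector2)
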