LanceY2004 commited on
Commit
5a7671c
1 Parent(s): 209c441

Upload test_product.py

Browse files
Files changed (1) hide show
  1. test_product.py +466 -0
test_product.py ADDED
@@ -0,0 +1,466 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tkinter as tk
3
+ from tkinter import filedialog, messagebox
4
+ import PyPDF2
5
+ import re
6
+ import json
7
+ import torch
8
+ import ollama
9
+ from openai import OpenAI
10
+ import argparse
11
+
12
# ANSI escape codes for colorizing terminal output (reset with RESET_COLOR).
PINK = '\033[95m'
CYAN = '\033[96m'
YELLOW = '\033[93m'
NEON_GREEN = '\033[92m'
RESET_COLOR = '\033[0m'
18
+
19
# Helper: read an entire text file into memory
def open_file(filepath):
    """Return the full contents of the UTF-8 text file at ``filepath``."""
    with open(filepath, encoding='utf-8') as handle:
        contents = handle.read()
    return contents
23
+
24
# Function to convert a user-selected PDF to text, chunk it, and append to vault.txt
def convert_pdf_to_text():
    """Prompt for a PDF, extract its text, split it into ~1000-char sentence
    chunks, and write the chunks to temp.txt, vault.txt, and a per-document
    file under local-rag/text_parse.

    Side effects only: creates directories/files under local-rag/, then
    prompts the user for a question which is answered via process_text_files.
    """
    file_path = filedialog.askopenfilename(filetypes=[("PDF Files", "*.pdf")])
    if not file_path:
        # Consistent with upload_txtfile when the dialog is cancelled.
        print("No file selected.")
        return

    base_directory = os.path.join("local-rag", "text_parse")
    file_name = os.path.basename(file_path)
    output_file_name = os.path.splitext(file_name)[0] + ".txt"
    file_output_path = os.path.join(base_directory, output_file_name)

    if not os.path.exists(base_directory):
        os.makedirs(base_directory)
        print(f"Directory '{base_directory}' created.")

    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text = ''
        for page in pdf_reader.pages:
            # Call extract_text() only once per page: it re-parses the page
            # each time, and the original called it twice.
            page_text = page.extract_text()
            if page_text:
                text += page_text + " "

    # Normalize whitespace, then split on sentence boundaries so chunks end cleanly.
    text = re.sub(r'\s+', ' ', text).strip()
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) + 1 < 1000:  # +1 for the space
            current_chunk += (sentence + " ").strip()
        else:
            chunks.append(current_chunk)
            current_chunk = sentence + " "
    if current_chunk:  # don't drop the trailing chunk
        chunks.append(current_chunk)

    # temp.txt names the active document on line 1, then one chunk per line.
    with open(os.path.join("local-rag", "temp.txt"), "w", encoding="utf-8") as temp_file:
        temp_file.write(output_file_name + "\n")
        for chunk in chunks:
            temp_file.write(chunk.strip() + "\n")

    # vault.txt accumulates every uploaded document.
    with open(os.path.join("local-rag", "vault.txt"), "a", encoding="utf-8") as vault_file:
        vault_file.write("\n")
        for chunk in chunks:
            vault_file.write(chunk.strip() + "\n")

    if not os.path.exists(file_output_path):
        with open(file_output_path, "w", encoding="utf-8") as f:
            for chunk in chunks:
                f.write(chunk.strip() + "\n")
            # Sentinel meaning embeddings have not been computed yet.
            f.write("====================NOT FINISHED====================\n")
        print(f"File '{file_output_path}' created with NOT FINISHED flag at the end.")
    else:
        print(f"File '{file_output_path}' already exists.")

    print("PDF content appended to vault.txt with each chunk on a separate line.")

    input_value = input("Enter your question:")
    process_text_files(input_value)
82
+
83
# Function to upload a text file, chunk it, and append to vault.txt
def upload_txtfile():
    """Prompt for a .txt file, split it into ~1000-char sentence chunks, and
    write the chunks to temp.txt, vault.txt, and a per-document file under
    local-rag/text_parse.

    Side effects only: creates directories/files under local-rag/, then
    prompts the user for a question which is answered via process_text_files.
    """
    file_path = filedialog.askopenfilename(filetypes=[("Text Files", "*.txt")])
    if file_path:
        base_directory = os.path.join("local-rag", "text_parse")

        file_name = os.path.basename(file_path)
        output_file_name = os.path.splitext(file_name)[0] + ".txt"
        file_output_path = os.path.join(base_directory, output_file_name)

        if not os.path.exists(base_directory):
            os.makedirs(base_directory)
            print(f"Directory '{base_directory}' created.")

        with open(file_path, 'r', encoding="utf-8") as txt_file:
            text = txt_file.read()

        # Normalize whitespace, then split on sentence boundaries so chunks end cleanly.
        text = re.sub(r'\s+', ' ', text).strip()
        sentences = re.split(r'(?<=[.!?]) +', text)
        chunks = []
        current_chunk = ""
        for sentence in sentences:
            if len(current_chunk) + len(sentence) + 1 < 1000:  # +1 for the space
                current_chunk += (sentence + " ").strip()
            else:
                chunks.append(current_chunk)
                current_chunk = sentence + " "
        if current_chunk:  # don't drop the trailing chunk
            chunks.append(current_chunk)

        # temp.txt names the active document on line 1, then one chunk per line.
        with open(os.path.join("local-rag", "temp.txt"), "w", encoding="utf-8") as temp_file:
            temp_file.write(output_file_name + "\n")
            for chunk in chunks:
                temp_file.write(chunk.strip() + "\n")

        # vault.txt accumulates every uploaded document.
        with open(os.path.join("local-rag", "vault.txt"), "a", encoding="utf-8") as vault_file:
            vault_file.write("\n")
            for chunk in chunks:
                vault_file.write(chunk.strip() + "\n")

        if not os.path.exists(file_output_path):
            # BUG FIX: write the chunks into the file before the flag (the old
            # code wrote an empty file plus the flag) and use UTF-8, matching
            # the PDF and JSON upload paths.
            with open(file_output_path, "w", encoding="utf-8") as f:
                for chunk in chunks:
                    f.write(chunk.strip() + "\n")
                f.write("====================NOT FINISHED====================\n")
            print(f"File '{file_output_path}' created with NOT FINISHED flag at the end.")
        else:
            print(f"File '{file_output_path}' already exists.")

        print("Text file content appended to vault.txt with each chunk on a separate line.")

        input_value = input("Enter your question:")
        process_text_files(input_value)
    else:
        print("No file selected.")
153
+
154
# Function to upload a JSON file, chunk its flattened text, and append to vault.txt
def upload_jsonfile():
    """Prompt for a .json file, flatten it to one JSON string, split it into
    ~1000-char sentence chunks, and write the chunks to temp.txt, vault.txt,
    and a per-document file under local-rag/text_parse.

    Side effects only: creates directories/files under local-rag/, then
    prompts the user for a question which is answered via process_text_files.
    """
    file_path = filedialog.askopenfilename(filetypes=[("JSON Files", "*.json")])
    if file_path:
        base_directory = os.path.join("local-rag", "text_parse")

        file_name = os.path.basename(file_path)
        output_file_name = os.path.splitext(file_name)[0] + ".txt"
        file_output_path = os.path.join(base_directory, output_file_name)

        if not os.path.exists(base_directory):
            os.makedirs(base_directory)
            print(f"Directory '{base_directory}' created.")

        with open(file_path, 'r', encoding="utf-8") as json_file:
            data = json.load(json_file)

        # Flatten the JSON data into a single string (keep non-ASCII intact).
        text = json.dumps(data, ensure_ascii=False)

        # Normalize whitespace, then split on sentence boundaries so chunks end cleanly.
        text = re.sub(r'\s+', ' ', text).strip()
        sentences = re.split(r'(?<=[.!?]) +', text)
        chunks = []
        current_chunk = ""
        for sentence in sentences:
            if len(current_chunk) + len(sentence) + 1 < 1000:  # +1 for the space
                current_chunk += (sentence + " ").strip()
            else:
                chunks.append(current_chunk)
                current_chunk = sentence + " "
        if current_chunk:  # don't drop the trailing chunk
            chunks.append(current_chunk)

        # temp.txt names the active document on line 1, then one chunk per line.
        with open(os.path.join("local-rag", "temp.txt"), "w", encoding="utf-8") as temp_file:
            temp_file.write(output_file_name + "\n")
            for chunk in chunks:
                temp_file.write(chunk.strip() + "\n")

        # vault.txt accumulates every uploaded document.
        with open(os.path.join("local-rag", "vault.txt"), "a", encoding="utf-8") as vault_file:
            vault_file.write("\n")
            for chunk in chunks:
                vault_file.write(chunk.strip() + "\n")

        if not os.path.exists(file_output_path):
            with open(file_output_path, "w", encoding="utf-8") as f:
                for chunk in chunks:
                    f.write(chunk.strip() + "\n")
                # Sentinel meaning embeddings have not been computed yet.
                f.write("====================NOT FINISHED====================\n")
            print(f"File '{file_output_path}' created with NOT FINISHED flag at the end.")
        else:
            print(f"File '{file_output_path}' already exists.")

        print("JSON file content appended to vault.txt with each chunk on a separate line.")

        input_value = input("Enter your question:")
        process_text_files(input_value)
    else:
        # Consistency fix: upload_txtfile reports a cancelled dialog; do the same here.
        print("No file selected.")
230
+
231
def summarize():
    """Open a small chooser window offering the two summarization entry points:
    uploading a .txt file or pasting text directly."""
    window = tk.Toplevel(root)
    window.title("Text Summarizer")
    window.geometry("400x200")

    # Prompt label at the top of the window.
    tk.Label(window, text="Choose an option to summarize text:").pack(pady=10)

    # Two actions: summarize a file, or open the paste-text dialog.
    tk.Button(window, text="Upload from .txt File",
              command=summarize_from_file).pack(pady=5)
    tk.Button(window, text="Paste your text",
              command=lambda: open_paste_window(window)).pack(pady=5)
246
+
247
# Function to upload a .txt file and summarize it with the chat model
def summarize_from_file():
    """Ask the user for a .txt file, embed each of its lines, and show a
    model-generated summary in a message box.

    Side effects: may create local-rag/text_sum; calls the Ollama embedding
    and chat endpoints.
    """
    file_path = filedialog.askopenfilename(filetypes=[("Text Files", "*.txt")])
    if file_path:
        # Directory where summaries are meant to live; created on demand.
        base_directory = os.path.join("local-rag", "text_sum")
        if not os.path.exists(base_directory):
            os.makedirs(base_directory)
            print(f"Directory '{base_directory}' created.")

        # BUG FIX: read the file the user actually selected (file_path).
        # The old code tested/opened the bare basename, which resolved
        # against the current working directory and almost never existed.
        summary_content = []
        if os.path.exists(file_path):
            with open(file_path, "r", encoding='utf-8') as sum_file:
                summary_content = sum_file.readlines()

        # One embedding per line of the selected document.
        summary_embeddings = []
        for content in summary_content:
            response = ollama.embeddings(model='mxbai-embed-large', prompt=content)
            summary_embeddings.append(response["embedding"])

        summary_embeddings_tensor = torch.tensor(summary_embeddings)
        print("Embeddings for each line in the vault:")
        print(summary_embeddings_tensor)

        conversation_history = []
        system_message = "You are a helpful assistant that is an expert at summarizing the text from a given document"
        user_input = "Summarize this paragraph"

        response = ollama_chat(user_input, system_message, summary_embeddings_tensor,
                               summary_content, args.model, conversation_history)

        messagebox.showinfo("Summary", response)
    else:
        messagebox.showerror("Error", "No file selected!")
284
+
285
# Function to open a window for pasting text and summarizing it
def open_paste_window(parent_window):
    """Open a child window with a text box; on Submit, send the pasted text
    to the chat model and show the returned summary in a message box."""
    paste_window = tk.Toplevel(parent_window)
    paste_window.title("Paste Your Text")
    paste_window.geometry("400x300")

    label = tk.Label(paste_window, text="Paste your text below:")
    label.pack(pady=5)

    input_textbox = tk.Text(paste_window, height=8, width=40)
    input_textbox.pack(pady=5)

    def submit_text():
        """Handle the Submit click: summarize the pasted text, if any."""
        pasted_text = input_textbox.get("1.0", tk.END).strip()
        if pasted_text:
            system_message = "You are a helpful assistant that is an expert at summarizing the text from a given document"
            user_input = "Summarize this paragraph:"
            new_value = user_input + pasted_text
            # BUG FIX: the messages were built as Python *sets*
            # ({"system", msg}) with the non-OpenAI role "human"; the chat
            # completions API requires {"role": ..., "content": ...} dicts
            # with roles "system"/"user", so the request could never succeed.
            messages = [
                {"role": "system", "content": system_message},
                {"role": "user", "content": new_value},
            ]
            response = client.chat.completions.create(model=args.model, messages=messages)
            response_value = response.choices[0].message.content

            messagebox.showinfo("Summary", response_value)
            paste_window.destroy()  # close the paste window once summarized
        else:
            messagebox.showerror("Error", "No text entered!")

    # Submit and Cancel buttons side by side at the bottom.
    submit_button = tk.Button(paste_window, text="Submit", command=submit_text)
    submit_button.pack(side=tk.LEFT, padx=10, pady=10)

    cancel_button = tk.Button(paste_window, text="Cancel", command=paste_window.destroy)
    cancel_button.pack(side=tk.RIGHT, padx=10, pady=10)
330
+
331
+
332
# Retrieve the vault lines most similar to the user's query
def get_relevant_context(rewritten_input, vault_embeddings, vault_content, top_k=3):
    """Return up to ``top_k`` stripped vault lines ranked by cosine similarity
    between their embeddings and the query's embedding."""
    # Nothing indexed yet -> nothing to retrieve.
    if vault_embeddings.nelement() == 0:
        return []

    query_vec = torch.tensor(
        ollama.embeddings(model='mxbai-embed-large', prompt=rewritten_input)["embedding"]
    ).unsqueeze(0)
    scores = torch.cosine_similarity(query_vec, vault_embeddings)

    # Never ask topk for more entries than exist.
    k = min(top_k, len(scores))
    best_indices = torch.topk(scores, k=k)[1].tolist()
    return [vault_content[i].strip() for i in best_indices]
342
+
343
# Function to interact with the chat model, augmenting the prompt with vault context
def ollama_chat(user_input, system_message, vault_embeddings, vault_content, ollama_model, conversation_history):
    """Send ``user_input`` (prefixed with retrieved vault context, if any) to
    the chat model and return the assistant's reply.

    Appends both the augmented user turn and the assistant turn to
    ``conversation_history`` in place.
    """
    relevant_context = get_relevant_context(user_input, vault_embeddings, vault_content, top_k=3)
    if relevant_context:
        context_str = "\n".join(relevant_context)
        print("Context Pulled from Documents: \n\n" + CYAN + context_str + RESET_COLOR)
        # Prepend the retrieved context so the model can ground its answer.
        user_input_with_context = context_str + "\n\n" + user_input
    else:
        print(CYAN + "No relevant context found." + RESET_COLOR)
        user_input_with_context = user_input

    conversation_history.append({"role": "user", "content": user_input_with_context})
    messages = [{"role": "system", "content": system_message}, *conversation_history]

    response = client.chat.completions.create(model=ollama_model, messages=messages)
    reply = response.choices[0].message.content
    conversation_history.append({"role": "assistant", "content": reply})
    return reply
363
+
364
# Function to process text files, check for the NOT FINISHED flag, and compute embeddings
def process_text_files(user_input):
    """Ensure embeddings exist for the document named on line 1 of temp.txt,
    then answer ``user_input`` against those embeddings via ollama_chat.

    Returns the model's response string, or False when a required directory,
    file, or saved tensor is missing.
    """
    text_parse_directory = os.path.join("local-rag", "text_parse")
    temp_file_path = os.path.join("local-rag", "temp.txt")

    if not os.path.exists(text_parse_directory):
        print(f"Directory '{text_parse_directory}' does not exist.")
        return False

    if not os.path.exists(temp_file_path):
        print("temp.txt does not exist.")
        return False

    # temp.txt's first line names the active document's .txt file.
    with open(temp_file_path, 'r', encoding='utf-8') as temp_file:
        first_line = temp_file.readline().strip()

    text_files = [f for f in os.listdir(text_parse_directory) if f.endswith('.txt')]
    if first_line not in text_files:
        # first_line already carries the .txt suffix; don't print it twice.
        print(f"No matching file found for '{first_line}' in text_parse directory.")
        return False

    file_path = os.path.join(text_parse_directory, first_line)
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f.readlines()]

    tensor_file_path = os.path.join(text_parse_directory, f"{first_line}_embedding.pt")

    if len(lines) >= 2 and lines[-1] == "====================NOT FINISHED====================":
        print(f"'{first_line}' contains the 'NOT FINISHED' flag. Computing embeddings.")

        # Embed every line of temp.txt (document name + chunks).
        vault_content = []
        if os.path.exists(temp_file_path):
            with open(temp_file_path, "r", encoding='utf-8') as vault_file:
                vault_content = vault_file.readlines()

        vault_embeddings = []
        for content in vault_content:
            response = ollama.embeddings(model='mxbai-embed-large', prompt=content)
            vault_embeddings.append(response["embedding"])

        vault_embeddings_tensor = torch.tensor(vault_embeddings)
        print("Embeddings for each line in the vault:")
        print(vault_embeddings_tensor)

        # Persist the tensor so future runs can skip re-embedding.
        with open(tensor_file_path, "wb") as tensor_file:
            torch.save(vault_embeddings_tensor, tensor_file)

        # Strip the NOT FINISHED flag, keeping one chunk per line.
        # BUG FIX: the old f.writelines(lines[:-1]) wrote the stripped lines
        # with no separators, collapsing the whole document onto one line.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write("\n".join(lines[:-1]) + "\n")
    else:
        print(f"'{first_line}' does not contain the 'NOT FINISHED' flag or is already complete. Loading tensor if it exists.")

        if not os.path.exists(tensor_file_path):
            # BUG FIX: the old code fell through with vault_embeddings_tensor
            # undefined and crashed with NameError at the ollama_chat call.
            print(f"Embedding tensor '{tensor_file_path}' not found.")
            return False

        vault_embeddings_tensor = torch.load(tensor_file_path)
        print("Loaded Vault Embedding Tensor:")
        print(vault_embeddings_tensor)

        vault_content = []
        if os.path.exists(temp_file_path):
            with open(temp_file_path, "r", encoding='utf-8') as vault_file:
                vault_content = vault_file.readlines()

    conversation_history = []
    system_message = "You are a helpful assistant that is an expert at extracting the most useful information from a given text"
    response = ollama_chat(user_input, system_message, vault_embeddings_tensor, vault_content, args.model, conversation_history)

    print(response)
    return response
436
+
437
# ---- Application setup (module-level script) ----

# Create the main window
root = tk.Tk()
root.title("Upload .pdf, .txt, or .json")

# One upload button per supported input format.
pdf_button = tk.Button(root, text="Upload PDF", command=convert_pdf_to_text)
pdf_button.pack(pady=15)

txt_button = tk.Button(root, text="Upload Text File", command=upload_txtfile)
txt_button.pack(pady=15)

json_button = tk.Button(root, text="Upload JSON File", command=upload_jsonfile)
json_button.pack(pady=15)

# BUG FIX: this button previously reused (shadowed) the json_button name.
summarize_button = tk.Button(root, text="Summarize This!", command=summarize)
summarize_button.pack(pady=15)

# OpenAI-compatible client pointed at the local Ollama server; the api_key
# value is a placeholder — Ollama ignores it but the client requires one.
client = OpenAI(base_url='http://localhost:11434/v1', api_key='llama3')

# Parse command-line arguments before entering the event loop so the button
# callbacks can read args.model.
parser = argparse.ArgumentParser(description="Ollama Chat")
parser.add_argument("--model", default="llama3", help="Ollama model to use (default: llama3)")
args = parser.parse_args()

# Run the main event loop
root.mainloop()