LanceY2004 committed on
Commit
bbd6a4c
1 Parent(s): f0307ae

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +468 -1
README.md CHANGED
@@ -5,4 +5,471 @@ language:
5
  base_model:
6
  - meta-llama/Llama-3.1-8B
7
  pipeline_tag: reinforcement-learning
8
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  base_model:
6
  - meta-llama/Llama-3.1-8B
7
  pipeline_tag: reinforcement-learning
8
+ ---
9
+
10
+ import os
11
+ import tkinter as tk
12
+ from tkinter import filedialog, messagebox
13
+ import PyPDF2
14
+ import re
15
+ import json
16
+ import torch
17
+ import ollama
18
+ from openai import OpenAI
19
+ import argparse
20
+
21
+ # ANSI escape codes used to color console output; RESET_COLOR ('\033[0m') ends a colored span
22
+ PINK = '\033[95m'
23
+ CYAN = '\033[96m'
24
+ YELLOW = '\033[93m'
25
+ NEON_GREEN = '\033[92m'
26
+ RESET_COLOR = '\033[0m'
27
+
28
+ # Function to open a file and return its contents as a string
29
def open_file(filepath):
    """Read the file at ``filepath`` as UTF-8 text and return its full contents."""
    with open(filepath, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents
32
+
33
+ # Function to convert PDF to text and append to vault.txt
34
def convert_pdf_to_text():
    """Convert a user-selected PDF into text chunks and prompt for a question.

    Opens a file dialog for a ``.pdf`` file, extracts the text of every page
    with PyPDF2, collapses whitespace and groups sentences into chunks of
    fewer than 1000 characters. The chunks are written to three places under
    ``local-rag``:

    * ``temp.txt`` (overwritten): the output file name on the first line,
      followed by one chunk per line;
    * ``vault.txt`` (appended): one chunk per line;
    * ``text_parse/<name>.txt``: one chunk per line plus a trailing
      "NOT FINISHED" flag line, written only if the file does not exist yet.

    Finally a question is read from stdin and passed to
    ``process_text_files``.
    """
    file_path = filedialog.askopenfilename(filetypes=[("PDF Files", "*.pdf")])
    if not file_path:
        # Same feedback as upload_txtfile when the dialog is cancelled.
        print("No file selected.")
        return

    base_directory = os.path.join("local-rag", "text_parse")
    file_name = os.path.basename(file_path)
    output_file_name = os.path.splitext(file_name)[0] + ".txt"
    file_output_path = os.path.join(base_directory, output_file_name)

    if not os.path.exists(base_directory):
        os.makedirs(base_directory)
        print(f"Directory '{base_directory}' created.")

    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        page_texts = []
        for page in pdf_reader.pages:
            # Extract once per page (the text was previously extracted twice)
            # and skip pages that yield no text.
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)

    # Normalize all runs of whitespace to single spaces.
    text = re.sub(r'\s+', ' ', " ".join(page_texts)).strip()

    # Split on spaces that follow sentence-ending punctuation.
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        # +1 accounts for the space that joins the sentence to the chunk.
        if len(current_chunk) + len(sentence) + 1 < 1000:
            # Join with a space; stripping only the outer whitespace keeps
            # consecutive sentences from being glued together.
            current_chunk = f"{current_chunk} {sentence}".strip()
        else:
            # Never store an empty chunk (happens when the very first
            # sentence is already at or above the limit).
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)

    # temp.txt describes the most recent upload: file name, then its chunks.
    with open(os.path.join("local-rag", "temp.txt"), "w", encoding="utf-8") as temp_file:
        temp_file.write(output_file_name + "\n")
        for chunk in chunks:
            temp_file.write(chunk.strip() + "\n")

    with open(os.path.join("local-rag", "vault.txt"), "a", encoding="utf-8") as vault_file:
        vault_file.write("\n")
        for chunk in chunks:
            vault_file.write(chunk.strip() + "\n")

    if not os.path.exists(file_output_path):
        with open(file_output_path, "w", encoding="utf-8") as f:
            for chunk in chunks:
                f.write(chunk.strip() + "\n")
            # The flag tells process_text_files that embeddings are pending.
            f.write("====================NOT FINISHED====================\n")
        print(f"File '{file_output_path}' created with NOT FINISHED flag at the end.")
    else:
        print(f"File '{file_output_path}' already exists.")

    print(f"PDF content appended to vault.txt with each chunk on a separate line.")

    input_value = input("Enter your question:")
    process_text_files(input_value)
91
+
92
+ # Function to upload a text file and append to vault.txt
93
def upload_txtfile():
    """Chunk a user-selected text file and prompt for a question.

    Opens a file dialog for a ``.txt`` file, reads it as UTF-8, collapses
    whitespace and groups sentences into chunks of fewer than 1000
    characters. The chunks are written to three places under ``local-rag``:

    * ``temp.txt`` (overwritten): the output file name on the first line,
      followed by one chunk per line;
    * ``vault.txt`` (appended): one chunk per line;
    * ``text_parse/<name>.txt``: one chunk per line plus a trailing
      "NOT FINISHED" flag line, written only if the file does not exist yet.

    Finally a question is read from stdin and passed to
    ``process_text_files``.
    """
    file_path = filedialog.askopenfilename(filetypes=[("Text Files", "*.txt")])
    if not file_path:
        print("No file selected.")
        return

    base_directory = os.path.join("local-rag", "text_parse")

    # Name of the parsed copy kept in the base directory.
    file_name = os.path.basename(file_path)
    output_file_name = os.path.splitext(file_name)[0] + ".txt"
    file_output_path = os.path.join(base_directory, output_file_name)

    if not os.path.exists(base_directory):
        os.makedirs(base_directory)
        print(f"Directory '{base_directory}' created.")

    with open(file_path, 'r', encoding="utf-8") as txt_file:
        text = txt_file.read()

    # Normalize all runs of whitespace to single spaces.
    text = re.sub(r'\s+', ' ', text).strip()

    # Split on spaces that follow sentence-ending punctuation.
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        # +1 accounts for the space that joins the sentence to the chunk.
        if len(current_chunk) + len(sentence) + 1 < 1000:
            # Join with a space; stripping only the outer whitespace keeps
            # consecutive sentences from being glued together.
            current_chunk = f"{current_chunk} {sentence}".strip()
        else:
            # Never store an empty chunk (happens when the very first
            # sentence is already at or above the limit).
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)

    # temp.txt describes the most recent upload: file name, then its chunks.
    with open(os.path.join("local-rag", "temp.txt"), "w", encoding="utf-8") as temp_file:
        temp_file.write(output_file_name + "\n")
        for chunk in chunks:
            temp_file.write(chunk.strip() + "\n")

    with open(os.path.join("local-rag", "vault.txt"), "a", encoding="utf-8") as vault_file:
        vault_file.write("\n")
        for chunk in chunks:
            vault_file.write(chunk.strip() + "\n")

    if not os.path.exists(file_output_path):
        with open(file_output_path, "w", encoding="utf-8") as f:
            # Write the chunks as the PDF and JSON uploaders do. With only
            # the flag line in the file, process_text_files does not treat
            # it as pending (it requires at least two lines) and never
            # computes the embeddings.
            for chunk in chunks:
                f.write(chunk.strip() + "\n")
            f.write("====================NOT FINISHED====================\n")
        print(f"File '{file_output_path}' created with NOT FINISHED flag at the end.")
    else:
        print(f"File '{file_output_path}' already exists.")

    print(f"Text file content appended to vault.txt with each chunk on a separate line.")

    input_value = input("Enter your question:")
    process_text_files(input_value)
162
+
163
+ # Function to upload a JSON file and append to vault.txt
164
def upload_jsonfile():
    """Chunk a user-selected JSON file and prompt for a question.

    Opens a file dialog for a ``.json`` file, parses it, re-serializes the
    data to a single string, collapses whitespace and groups "sentences"
    into chunks of fewer than 1000 characters. The chunks are written to
    three places under ``local-rag``:

    * ``temp.txt`` (overwritten): the output file name on the first line,
      followed by one chunk per line;
    * ``vault.txt`` (appended): one chunk per line;
    * ``text_parse/<name>.txt``: one chunk per line plus a trailing
      "NOT FINISHED" flag line, written only if the file does not exist yet.

    Finally a question is read from stdin and passed to
    ``process_text_files``. A file that is not valid JSON is reported in an
    error dialog and nothing is written.
    """
    file_path = filedialog.askopenfilename(filetypes=[("JSON Files", "*.json")])
    if not file_path:
        # Same feedback as upload_txtfile when the dialog is cancelled.
        print("No file selected.")
        return

    base_directory = os.path.join("local-rag", "text_parse")

    # Name of the parsed copy kept in the base directory.
    file_name = os.path.basename(file_path)
    output_file_name = os.path.splitext(file_name)[0] + ".txt"
    file_output_path = os.path.join(base_directory, output_file_name)

    if not os.path.exists(base_directory):
        os.makedirs(base_directory)
        print(f"Directory '{base_directory}' created.")

    try:
        with open(file_path, 'r', encoding="utf-8") as json_file:
            data = json.load(json_file)
    except json.JSONDecodeError as exc:
        messagebox.showerror("Error", f"Could not parse JSON file: {exc}")
        return

    # Flatten the JSON data into a single string.
    text = json.dumps(data, ensure_ascii=False)

    # Normalize all runs of whitespace to single spaces.
    text = re.sub(r'\s+', ' ', text).strip()

    # Split on spaces that follow sentence-ending punctuation.
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        # +1 accounts for the space that joins the sentence to the chunk.
        if len(current_chunk) + len(sentence) + 1 < 1000:
            # Join with a space; stripping only the outer whitespace keeps
            # consecutive sentences from being glued together.
            current_chunk = f"{current_chunk} {sentence}".strip()
        else:
            # Never store an empty chunk. JSON text often has no sentence
            # breaks, so the first "sentence" can exceed the limit.
            if current_chunk:
                chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)

    # temp.txt describes the most recent upload: file name, then its chunks.
    with open(os.path.join("local-rag", "temp.txt"), "w", encoding="utf-8") as temp_file:
        temp_file.write(output_file_name + "\n")
        for chunk in chunks:
            temp_file.write(chunk.strip() + "\n")

    with open(os.path.join("local-rag", "vault.txt"), "a", encoding="utf-8") as vault_file:
        vault_file.write("\n")
        for chunk in chunks:
            vault_file.write(chunk.strip() + "\n")

    if not os.path.exists(file_output_path):
        with open(file_output_path, "w", encoding="utf-8") as f:
            for chunk in chunks:
                f.write(chunk.strip() + "\n")
            # The flag tells process_text_files that embeddings are pending.
            f.write("====================NOT FINISHED====================\n")
        print(f"File '{file_output_path}' created with NOT FINISHED flag at the end.")
    else:
        print(f"File '{file_output_path}' already exists.")

    print(f"JSON file content appended to vault.txt with each chunk on a separate line.")

    input_value = input("Enter your question:")
    process_text_files(input_value)
239
+
240
def summarize():
    """Open the "Text Summarizer" window with its two summarization options.

    The window is a child of the global ``root`` and offers one button that
    summarizes an uploaded .txt file and one that opens the paste window.
    """
    window = tk.Toplevel(root)
    window.title("Text Summarizer")
    window.geometry("400x200")

    # Heading shown above the two buttons.
    tk.Label(window, text="Choose an option to summarize text:").pack(pady=10)

    # Option 1: pick a .txt file from disk.
    tk.Button(window, text="Upload from .txt File", command=summarize_from_file).pack(pady=5)

    # Option 2: paste text into a dedicated child window.
    def show_paste_window():
        open_paste_window(window)

    tk.Button(window, text="Paste your text", command=show_paste_window).pack(pady=5)
255
+
256
+ # Function to upload a .txt file and summarize
257
def summarize_from_file():
    """Summarize a user-selected .txt file and show the result in a dialog.

    Opens a file dialog, embeds every non-blank line of the chosen file with
    the ``mxbai-embed-large`` model and passes those embeddings, together
    with the lines themselves, to ``ollama_chat`` with a summarization
    prompt. The model's answer is shown in an info dialog; a cancelled
    dialog or an empty file is reported in an error dialog.
    """
    file_path = filedialog.askopenfilename(filetypes=[("Text Files", "*.txt")])
    if not file_path:
        messagebox.showerror("Error", "No file selected!")
        return

    # Make sure the summary directory exists.
    base_directory = os.path.join("local-rag", "text_sum")
    if not os.path.exists(base_directory):
        os.makedirs(base_directory)
        print(f"Directory '{base_directory}' created.")

    # Read the file the user actually selected. Looking up only its base
    # name resolves against the working directory and normally finds nothing.
    with open(file_path, "r", encoding='utf-8') as sum_file:
        summary_content = [line for line in sum_file if line.strip()]

    if not summary_content:
        messagebox.showerror("Error", "The selected file is empty!")
        return

    summary_embeddings = [
        ollama.embeddings(model='mxbai-embed-large', prompt=content)["embedding"]
        for content in summary_content
    ]

    summary_embeddings_tensor = torch.tensor(summary_embeddings)
    print("Embeddings for each line in the vault:")
    print(summary_embeddings_tensor)

    conversation_history = []
    system_message = "You are a helpful assistant that is an expert at summarizing the text from a given document"
    user_input = "Summarize this paragraph"

    response = ollama_chat(user_input, system_message, summary_embeddings_tensor, summary_content, args.model, conversation_history)

    messagebox.showinfo("Summary", response)
293
+
294
+ # Function to open a window for pasting text and summarizing
295
def open_paste_window(parent_window):
    """Open a child window where text can be pasted and summarized.

    Args:
        parent_window: The Tk widget that owns the new ``Toplevel`` window.
    """
    paste_window = tk.Toplevel(parent_window)
    paste_window.title("Paste Your Text")
    paste_window.geometry("400x300")

    label = tk.Label(paste_window, text="Paste your text below:")
    label.pack(pady=5)

    input_textbox = tk.Text(paste_window, height=8, width=40)
    input_textbox.pack(pady=5)

    def submit_text():
        """Send the pasted text to the chat model and display its summary."""
        pasted_text = input_textbox.get("1.0", tk.END).strip()
        if not pasted_text:
            messagebox.showerror("Error", "No text entered!")
            return

        system_message = "You are a helpful assistant that is an expert at summarizing the text from a given document"
        user_input = "Summarize this paragraph:"
        # Chat messages must be role/content mappings. The set literals used
        # before ({"system", text}) are not valid messages, and the chat
        # API's role for end-user input is "user", not "human".
        messages = [
            {"role": "system", "content": system_message},
            # A newline keeps the instruction from running into the text.
            {"role": "user", "content": f"{user_input}\n{pasted_text}"},
        ]
        response = client.chat.completions.create(model=args.model, messages=messages)
        response_value = response.choices[0].message.content

        messagebox.showinfo("Summary", response_value)
        paste_window.destroy()  # Close the window once the summary is shown

    # Submit and Cancel buttons.
    submit_button = tk.Button(paste_window, text="Submit", command=submit_text)
    submit_button.pack(side=tk.LEFT, padx=10, pady=10)

    cancel_button = tk.Button(paste_window, text="Cancel", command=paste_window.destroy)
    cancel_button.pack(side=tk.RIGHT, padx=10, pady=10)
339
+
340
+
341
+ # Function to get relevant context from the vault based on user input
342
def get_relevant_context(rewritten_input, vault_embeddings, vault_content, top_k=3):
    """Return the vault entries most similar to ``rewritten_input``.

    Args:
        rewritten_input: Query text to embed with ``mxbai-embed-large``.
        vault_embeddings: Tensor of embeddings, compared row-wise to the query.
        vault_content: Sequence of texts indexed in step with the embeddings.
        top_k: Maximum number of entries to return.

    Returns:
        A list of up to ``top_k`` stripped entries from ``vault_content``,
        ordered by descending cosine similarity; an empty list when
        ``vault_embeddings`` has no elements.
    """
    # Nothing to search in an empty vault.
    if vault_embeddings.nelement() == 0:
        return []

    query_vector = ollama.embeddings(model='mxbai-embed-large', prompt=rewritten_input)["embedding"]
    query_tensor = torch.tensor(query_vector).unsqueeze(0)
    similarities = torch.cosine_similarity(query_tensor, vault_embeddings)

    # Never ask for more results than there are scores.
    limit = min(top_k, len(similarities))
    _, best_positions = torch.topk(similarities, k=limit)

    return [vault_content[position].strip() for position in best_positions.tolist()]
351
+
352
+ # Function to interact with the Ollama model
353
def ollama_chat(user_input, system_message, vault_embeddings, vault_content, ollama_model, conversation_history):
    """Answer ``user_input`` with the chat model, adding retrieved context.

    Up to three relevant vault entries are looked up and, when any are
    found, prepended to the user message. The user message and the model's
    reply are both appended to ``conversation_history`` (mutated in place).

    Returns:
        The text content of the model's reply.
    """
    relevant_context = get_relevant_context(user_input, vault_embeddings, vault_content, top_k=3)

    if relevant_context:
        context_str = "\n".join(relevant_context)
        print("Context Pulled from Documents: \n\n" + CYAN + context_str + RESET_COLOR)
        user_input_with_context = context_str + "\n\n" + user_input
    else:
        print(CYAN + "No relevant context found." + RESET_COLOR)
        user_input_with_context = user_input

    conversation_history.append({"role": "user", "content": user_input_with_context})

    # The system prompt always comes first, followed by the whole history.
    messages = [{"role": "system", "content": system_message}]
    messages.extend(conversation_history)

    response = client.chat.completions.create(model=ollama_model, messages=messages)
    reply = response.choices[0].message.content
    conversation_history.append({"role": "assistant", "content": reply})

    return reply
372
+
373
+ # Function to process text files, check for NOT FINISHED flag, and compute embeddings
374
def process_text_files(user_input):
    """Answer ``user_input`` using the document described by ``temp.txt``.

    ``local-rag/temp.txt`` holds the name of the most recently uploaded
    document on its first line, followed by that document's chunks. If the
    matching file in ``local-rag/text_parse`` still ends with the
    "NOT FINISHED" flag, or no saved tensor exists for it, embeddings are
    computed for every line of ``temp.txt`` and saved next to the file as
    ``<name>_embedding.pt``; otherwise the saved tensor is loaded. A flagged
    file is rewritten without its flag line. The question is then answered
    through ``ollama_chat``.

    Args:
        user_input: The question to ask about the document.

    Returns:
        The model's reply, or ``False`` when the ``text_parse`` directory,
        ``temp.txt`` or the referenced document file is missing.
    """
    not_finished_flag = "====================NOT FINISHED===================="
    text_parse_directory = os.path.join("local-rag", "text_parse")
    temp_file_path = os.path.join("local-rag", "temp.txt")

    if not os.path.exists(text_parse_directory):
        print(f"Directory '{text_parse_directory}' does not exist.")
        return False

    if not os.path.exists(temp_file_path):
        print("temp.txt does not exist.")
        return False

    # Every line of temp.txt is used as retrievable content, including the
    # file-name header on the first line. The header is kept so that rows
    # stay aligned with tensors that were saved earlier.
    with open(temp_file_path, 'r', encoding='utf-8') as temp_file:
        vault_content = temp_file.readlines()
    first_line = vault_content[0].strip() if vault_content else ""

    text_files = [f for f in os.listdir(text_parse_directory) if f.endswith('.txt')]

    if first_line not in text_files:
        # first_line already carries the .txt extension.
        print(f"No matching file found for '{first_line}' in text_parse directory.")
        return False

    file_path = os.path.join(text_parse_directory, first_line)
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f]

    tensor_file_path = os.path.join(text_parse_directory, f"{first_line}_embedding.pt")
    is_flagged = len(lines) >= 2 and lines[-1] == not_finished_flag
    vault_embeddings_tensor = None

    if is_flagged:
        print(f"'{first_line}' contains the 'NOT FINISHED' flag. Computing embeddings.")
    else:
        print(f"'{first_line}' does not contain the 'NOT FINISHED' flag or is already complete. Loading tensor if it exists.")
        if os.path.exists(tensor_file_path):
            # NOTE(review): torch.load unpickles the file; this is only safe
            # because the tensor file is written by this program itself.
            vault_embeddings_tensor = torch.load(tensor_file_path)
            print("Loaded Vault Embedding Tensor:")
            print(vault_embeddings_tensor)
        else:
            # Without this fallback the tensor would be undefined below.
            print(f"No saved embedding tensor found at '{tensor_file_path}'. Computing embeddings.")

    if vault_embeddings_tensor is None:
        vault_embeddings = [
            ollama.embeddings(model='mxbai-embed-large', prompt=content)["embedding"]
            for content in vault_content
        ]
        vault_embeddings_tensor = torch.tensor(vault_embeddings)
        print("Embeddings for each line in the vault:")
        print(vault_embeddings_tensor)

        with open(tensor_file_path, "wb") as tensor_file:
            torch.save(vault_embeddings_tensor, tensor_file)

    if is_flagged:
        # Drop the flag line. The lines were stripped when read, so the
        # newline has to be restored or all chunks collapse onto one line.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.writelines(line + "\n" for line in lines[:-1])

    conversation_history = []
    system_message = "You are a helpful assistant that is an expert at extracting the most useful information from a given text"
    response = ollama_chat(user_input, system_message, vault_embeddings_tensor, vault_content, args.model, conversation_history)

    print(response)

    return response
445
+
446
# Parse command-line arguments first so that `--help` or an invalid option
# exits before any window is created.
parser = argparse.ArgumentParser(description="Ollama Chat")
parser.add_argument("--model", default="llama3", help="Ollama model to use (default: llama3)")
args = parser.parse_args()

# Configuration for the Ollama API client (OpenAI-compatible endpoint on
# localhost). NOTE(review): the api_key value looks like a placeholder -- confirm.
client = OpenAI(base_url='http://localhost:11434/v1', api_key='llama3')

# Create the main window
root = tk.Tk()
root.title("Upload .pdf, .txt, or .json")

# Create a button to open the file dialog for PDF
pdf_button = tk.Button(root, text="Upload PDF", command=convert_pdf_to_text)
pdf_button.pack(pady=15)

# Create a button to open the file dialog for text file
txt_button = tk.Button(root, text="Upload Text File", command=upload_txtfile)
txt_button.pack(pady=15)

# Create a button to open the file dialog for JSON file
json_button = tk.Button(root, text="Upload JSON File", command=upload_jsonfile)
json_button.pack(pady=15)

# Create a button to open the summarizer (given its own name so that
# json_button keeps referring to the JSON upload button)
summarize_button = tk.Button(root, text="Summarize This!", command=summarize)
summarize_button.pack(pady=15)

# Run the main event loop
root.mainloop()