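'''
Utilities for turning a directory of PDF reports into FAISS vector indexes
using OpenAI embeddings, either as one index per PDF or as a single combined index.
'''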
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
import os
import shutil
import pandas as pd
class Data:
    def __init__(self, inp_dir='reports', out_dir="output_reports") -> None:
        self.data_dir = inp_dir
        self.out_dir = out_dir
    def check_output(self):
        '''
        Prepare the output folder that stores the FAISS indexes of individual PDFs.
        If the folder already exists, delete its contents; otherwise create it.
        '''
        folder_path = self.out_dir
        # Check if the folder exists
        if os.path.exists(folder_path):
            # If the folder exists, delete its contents
            for filename in os.listdir(folder_path):
                file_path = os.path.join(folder_path, filename)
                try:
                    if os.path.isfile(file_path):
                        os.unlink(file_path)
                    elif os.path.isdir(file_path):
                        shutil.rmtree(file_path)
                except Exception as e:
                    print(f"Failed to delete {file_path}. Reason: {e}")
            print("Folder content deleted.")
        else:
            # If the folder doesn't exist, create it
            try:
                os.makedirs(folder_path)
                print("Folder created.")
            except Exception as e:
                print(f"Failed to create folder. Reason: {e}")
    def get_faiss_embeddings(self):
        '''
        Splits each PDF into chunks and saves a separate FAISS index per PDF
        under the output directory, plus a CSV mapping PDF names to index numbers.
        '''
        # Get a list of all PDFs in the specified directory
        list_pdfs = os.listdir(self.data_dir)
        # Initialize OpenAI embeddings
        embedding = OpenAIEmbeddings()
        # Make a separate directory for each PDF
        pdf_names = []
        pdf_num = []
        dir_num = 0
        text_count = 0
        for pdf in list_pdfs:
            dir_num += 1
            new_dir = os.path.join(self.out_dir, str(dir_num))
            os.makedirs(new_dir)
            print('Creating Database for PDF ' + str(dir_num))
            pdf_file = os.path.join(self.data_dir, pdf)
            reader = PdfReader(pdf_file)
            # Get the textual content of the PDF
            raw_text = ''
            for i, page in enumerate(reader.pages):
                text = page.extract_text()
                if text:
                    raw_text += text
            # Split the text into overlapping chunks
            text_splitter = CharacterTextSplitter(
                separator="\n",
                chunk_size=1000,
                chunk_overlap=200,
                length_function=len,
            )
            texts = text_splitter.split_text(raw_text)
            text_count += len(raw_text)
            print('Length of text: ' + str(len(raw_text)))
            # Create the embedding index
            db = FAISS.from_texts(texts, embedding)
            # Save the index to disk
            db.save_local(os.path.join(new_dir, "faiss_index"))
            pdf_names.append(pdf)
            pdf_num.append(dir_num)
        # Save a mapping from PDF names to index numbers
        data_df = {
            "names": pdf_names,
            "index": pdf_num
        }
        df = pd.DataFrame(data_df)
        map_name = os.path.split(self.out_dir)[-1]
        # Ensure the mapping directory exists before writing the CSV
        os.makedirs(os.path.join("outputs", "mappings"), exist_ok=True)
        df.to_csv(os.path.join("outputs", "mappings", str(map_name) + ".csv"))
        print('Total text in data: ' + str(text_count))
        return None
    def get_combined_faiss_embedding(self):
        '''
        Concatenates the text of all PDFs in the input directory, splits it into
        chunks, and saves a single combined FAISS index in the output directory.
        '''
        # Get a list of all PDFs in the specified directory
        list_pdfs = os.listdir(self.data_dir)
        # Initialize OpenAI embeddings
        embedding = OpenAIEmbeddings()
        raw_text = ''
        for pdf in list_pdfs:
            print('Creating Database for PDF ' + str(pdf))
            pdf_file = os.path.join(self.data_dir, pdf)
            reader = PdfReader(pdf_file)
            # Get the textual content of the PDF
            for i, page in enumerate(reader.pages):
                text = page.extract_text()
                if text:
                    raw_text += text
        # Split the combined text into overlapping chunks
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )
        texts = text_splitter.split_text(raw_text)
        text_count = len(raw_text)
        print('Length of text: ' + str(len(raw_text)))
        # Create the embedding index
        db = FAISS.from_texts(texts, embedding)
        # Save the index to disk
        db.save_local(os.path.join(self.out_dir, "faiss_index"))
        print('Total text in data: ' + str(text_count))
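# Minimal usage sketch (an assumption, not part of the original module): it presumes
# the input directory of PDFs exists and that OPENAI_API_KEY is set in the
# environment, since OpenAIEmbeddings reads the key from there.
if __name__ == "__main__":
    data = Data(inp_dir='reports', out_dir='output_reports')
    data.check_output()                    # clear or create the output folder
    data.get_faiss_embeddings()            # one FAISS index per PDF
    # data.get_combined_faiss_embedding()  # or a single combined index

    # Loading a saved index back: FAISS.load_local pairs with save_local
    # (newer langchain versions may also require allow_dangerous_deserialization=True).
    # db = FAISS.load_local(os.path.join('output_reports', '1', 'faiss_index'),
    #                       OpenAIEmbeddings())
    # docs = db.similarity_search("example query", k=3)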