import os
import shutil

import pandas as pd
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
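
# Note: OpenAIEmbeddings reads the OPENAI_API_KEY environment variable, which
# must be set before get_faiss_embeddings() is called. These import paths
# target the classic `langchain` package; newer releases expose the same
# classes via `langchain_community`.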
class Data:
    def __init__(self, inp_dir='reports', out_dir="output_reports") -> None:
        self.data_dir = inp_dir
        self.out_dir = out_dir
    def check_output(self):
        '''
        Prepare the output folder that stores the per-PDF FAISS indexes:
        empty it if it already exists, otherwise create it.
        '''
        folder_path = self.out_dir
        # Check if the folder exists
        if os.path.exists(folder_path):
            # If the folder exists, delete its contents
            for filename in os.listdir(folder_path):
                file_path = os.path.join(folder_path, filename)
                try:
                    if os.path.isfile(file_path):
                        os.unlink(file_path)
                    elif os.path.isdir(file_path):
                        shutil.rmtree(file_path)
                except Exception as e:
                    print(f"Failed to delete {file_path}. Reason: {e}")
            print("Folder content deleted.")
        else:
            # If the folder doesn't exist, create it
            try:
                os.makedirs(folder_path)
                print("Folder created.")
            except Exception as e:
                print(f"Failed to create folder. Reason: {e}")
    def get_faiss_embeddings(self):
        '''
        Read every PDF in the input directory, split its text into chunks,
        build and save a FAISS index per PDF, and write a CSV mapping PDF
        file names to their index numbers.
        '''
        # Get a list of all PDFs in the specified directory
        list_pdfs = [f for f in os.listdir(self.data_dir) if f.lower().endswith('.pdf')]
        # Initialize OpenAI embeddings
        embedding = OpenAIEmbeddings()
        # Make a separate directory for each PDF
        pdf_names = []
        pdf_num = []
        dir_num = 0
        text_count = 0
        for pdf in list_pdfs:
            dir_num += 1
            new_dir = os.path.join(self.out_dir, str(dir_num))
            os.makedirs(new_dir)
            print('Creating Database for PDF ' + str(dir_num))
            pdf_file = os.path.join(self.data_dir, pdf)
            reader = PdfReader(pdf_file)
            # Get the textual content of the PDF
            raw_text = ''
            for page in reader.pages:
                text = page.extract_text()
                if text:
                    raw_text += text
            # Split the text into overlapping chunks
            text_splitter = CharacterTextSplitter(
                separator="\n",
                chunk_size=1000,
                chunk_overlap=200,
                length_function=len,
            )
            texts = text_splitter.split_text(raw_text)
            text_count += len(raw_text)
            print('Length of text: ' + str(len(raw_text)))
            # Create the embedding index
            db = FAISS.from_texts(texts, embedding)
            # Save the index
            db.save_local(os.path.join(new_dir, "faiss_index"))
            pdf_names.append(pdf)
            pdf_num.append(dir_num)
        # Write the PDF-name-to-index-number mapping
        data_df = {
            "names": pdf_names,
            "index": pdf_num
        }
        df = pd.DataFrame(data_df)
        map_name = os.path.split(self.out_dir)[-1]
        # Ensure the mappings directory exists before writing the CSV
        os.makedirs(os.path.join("outputs", "mappings"), exist_ok=True)
        df.to_csv(os.path.join("outputs", "mappings", str(map_name) + ".csv"))
        print('Total text in data: ' + str(text_count))
        return None
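
# Example usage sketch (not part of the original pipeline): builds one FAISS
# index per PDF found in the 'reports' directory. Assumes that directory
# exists, contains PDFs, and that OPENAI_API_KEY is set in the environment.
if __name__ == "__main__":
    data = Data(inp_dir='reports', out_dir="output_reports")
    data.check_output()          # reset the output directory
    data.get_faiss_embeddings()  # create and save one index per PDF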