File size: 4,797 Bytes
3369d9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192dc63
3369d9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192dc63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3369d9f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS 

import os 
import shutil

import pandas as pd

class Data:
    """Build FAISS vector indexes from a directory of PDF files.

    Text is extracted from every PDF in ``inp_dir``, split into overlapping
    character chunks, embedded with ``OpenAIEmbeddings`` and stored as FAISS
    indexes under ``out_dir``.

    Args:
        inp_dir: Directory that contains the input PDF files.
        out_dir: Directory that receives the generated FAISS indexes.
    """

    # Chunking parameters passed to CharacterTextSplitter.
    CHUNK_SIZE = 1000
    CHUNK_OVERLAP = 200

    def __init__(self, inp_dir='reports', out_dir="output_reports") -> None:
        self.data_dir = inp_dir
        self.out_dir = out_dir

    def check_output(self):
        '''
        Prepare an empty output folder.

        If the folder already exists, every entry inside it is deleted
        (the folder itself is kept); otherwise the folder is created.
        Failures are reported on stdout instead of being raised, so a single
        undeletable entry does not stop the clean-up.
        '''
        folder_path = self.out_dir

        if not os.path.exists(folder_path):
            try:
                os.makedirs(folder_path)
                print("Folder created.")
            except OSError as e:
                print(f"Failed to create folder. Reason: {e}")
            return

        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            try:
                # Symlinks are unlinked rather than followed: shutil.rmtree
                # refuses to operate on a symlink, and a dangling link is
                # neither a file nor a directory.
                if os.path.islink(file_path) or os.path.isfile(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except OSError as e:
                print(f"Failed to delete {file_path}. Reason: {e}")
        print("Folder content deleted.")

    def _list_pdfs(self):
        '''Return the sorted names of the ``.pdf`` files in the input folder.'''
        # Sorting makes the index numbering reproducible (os.listdir order is
        # arbitrary); filtering keeps stray files and sub-folders away from
        # PdfReader.
        return sorted(
            name
            for name in os.listdir(self.data_dir)
            if name.lower().endswith(".pdf")
            and os.path.isfile(os.path.join(self.data_dir, name))
        )

    @staticmethod
    def _read_pdf_text(pdf_file):
        '''Return the concatenated text of all pages of ``pdf_file``.'''
        reader = PdfReader(pdf_file)
        page_texts = (page.extract_text() for page in reader.pages)
        # Pages without extractable text yield an empty/None value; skip them.
        return ''.join(text for text in page_texts if text)

    def _split_text(self, raw_text):
        '''Split ``raw_text`` on newlines into overlapping chunks.'''
        text_splitter = CharacterTextSplitter(
            separator="\n",
            chunk_size=self.CHUNK_SIZE,
            chunk_overlap=self.CHUNK_OVERLAP,
            length_function=len,
        )
        return text_splitter.split_text(raw_text)

    def get_faiss_embeddings(self):
        '''
        Create one FAISS index per PDF.

        Each PDF gets a numbered sub-folder ``<out_dir>/<n>/faiss_index``
        (numbering starts at 1, in sorted file-name order). A CSV mapping the
        PDF file names to their numbers is written to
        ``outputs/mappings/<basename of out_dir>.csv``.

        PDFs that yield no text chunks are reported and skipped; they keep
        their number but get no folder and no mapping row.

        Returns:
            None
        '''
        list_pdfs = self._list_pdfs()
        # Initialize OPENAI embeddings
        embedding = OpenAIEmbeddings()

        pdf_names = []
        pdf_num = []
        text_count = 0
        for dir_num, pdf in enumerate(list_pdfs, start=1):
            print('Creating Database for PDF ' + str(dir_num))
            raw_text = self._read_pdf_text(os.path.join(self.data_dir, pdf))
            texts = self._split_text(raw_text)
            text_count += len(raw_text)
            print('Length of text: ' + str(len(raw_text)))

            if not texts:
                # There is nothing to embed, so no index can be built.
                print(f"No extractable text in {pdf}. Skipping.")
                continue

            # exist_ok keeps a re-run over a non-empty output folder working.
            new_dir = os.path.join(self.out_dir, str(dir_num))
            os.makedirs(new_dir, exist_ok=True)

            # Create and save the embedding index
            db = FAISS.from_texts(texts, embedding)
            db.save_local(os.path.join(new_dir, "faiss_index"))

            pdf_names.append(pdf)
            pdf_num.append(dir_num)

        df = pd.DataFrame({
            "names": pdf_names,
            "index": pdf_num,
        })
        # normpath drops a trailing separator, which would otherwise leave an
        # empty base name and produce a file called ".csv".
        map_name = os.path.basename(os.path.normpath(self.out_dir))
        mapping_dir = os.path.join("outputs", "mappings")
        os.makedirs(mapping_dir, exist_ok=True)
        df.to_csv(os.path.join(mapping_dir, str(map_name) + ".csv"))
        print('Total text in data: ' + str(text_count))

    def get_combined_faiss_embedding(self):
        '''
        Create a single FAISS index covering all PDFs.

        The text of every PDF (in sorted file-name order) is concatenated,
        split into chunks and saved as ``<out_dir>/faiss_index``. When no text
        could be extracted at all, no index is written.

        Returns:
            None
        '''
        list_pdfs = self._list_pdfs()
        # Initialize OPENAI embeddings
        embedding = OpenAIEmbeddings()

        pdf_texts = []
        for pdf in list_pdfs:
            print('Creating Database for PDF ' + str(pdf))
            pdf_texts.append(
                self._read_pdf_text(os.path.join(self.data_dir, pdf))
            )
        raw_text = ''.join(pdf_texts)

        texts = self._split_text(raw_text)
        text_count = len(raw_text)
        print('Length of text: ' + str(len(raw_text)))

        if texts:
            # Create and save the embedding index
            db = FAISS.from_texts(texts, embedding)
            db.save_local(os.path.join(self.out_dir, "faiss_index"))
        else:
            # There is nothing to embed, so no index can be built.
            print("No extractable text found. No index created.")

        print('Total text in data: ' + str(text_count))