from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer
from torch import cuda
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient
from auditqa.reports import files, report_list

device = 'cuda' if cuda.is_available() else 'cpu'

### NOTE: this script is NO LONGER IN USE ###
# It preprocesses report PDFs into chunks and adds them to the existing reports database.

# path to the pdf files
path_to_data = "./data/pdf/"

def process_pdf():
    """
    Read the PDFs listed in `report_list`, split them into chunks,
    and build the vector database from the result.
    """
    # load all the files using PyMuPDFLoader
    docs = {}
    for file in report_list:
        try:
            docs[file] = PyMuPDFLoader(path_to_data + file + '.pdf').load()
        except Exception as e:
            print("Exception while loading", file, ":", e)
    
    # token-based text splitter built from the tokenizer of the chosen embedding model,
    # so that chunks fit exactly within the transformer's context window
    # langchain text splitters: https://python.langchain.com/docs/modules/data_connection/document_transformers/
    chunk_size = 256
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
            AutoTokenizer.from_pretrained("BAAI/bge-large-en-v1.5"),
            chunk_size=chunk_size,
            chunk_overlap=10,
            add_start_index=True,
            strip_whitespace=True,
            separators=["\n\n", "\n"],
    )
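    # e.g. with chunk_size=256 tokens and chunk_overlap=10, a roughly
    # 1,000-token page yields about four chunks, with adjacent chunks
    # sharing around 10 tokens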
    # iterate through the files dict, whose structure encodes each document's
    # 'source' (category) and 'subtype'; the UI uses these fields for document
    # selection, and they are later used for filtering the database
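    # For illustration only (hypothetical values, not the real contents of
    # auditqa.reports): `files` is assumed to map category -> subtype -> [filenames],
    # and `report_list` is the flat list of all filenames, e.g.
    #   files = {"District": {"Gulu": ["Gulu_2022"]},
    #            "Ministry": {"Health": ["Health_2021"]}}
    #   report_list = ["Gulu_2022", "Health_2021"]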
    all_documents = {}
    categories = list(files.keys())
    # iterate through 'source'
    for category in categories:
        print("splitting documents in source:", category)
        all_documents[category] = []
        subtypes = list(files[category].keys())
        # iterate through each 'subtype' within the source
        # e.g. the source/category 'District' has one subtype per district name
        for subtype in subtypes:
            print("splitting documents for subtype:", subtype)
            for file in files[category][subtype]:

                # create the chunks
                doc_processed = text_splitter.split_documents(docs[file])
                print("chunks in subtype:",subtype, "are:",len(doc_processed))

                # add metadata used later for filtering
                for doc in doc_processed:
                    doc.metadata["source"] = category
                    doc.metadata["subtype"] = subtype
                    # filenames are expected to end with the report year
                    doc.metadata["year"] = file[-4:]
                    doc.metadata["filename"] = file

                all_documents[category].append(doc_processed)
    
    # flatten each category's list of lists into a single list of chunks
    for key, docs_processed in all_documents.items():
        docs_processed = [item for sublist in docs_processed for item in sublist]
        print("length of chunks in source:", key, "are:", len(docs_processed))
        all_documents[key] = docs_processed
    # 'allreports' combines the chunks from every category
    all_documents['allreports'] = [item for sublist in all_documents.values() for item in sublist]
    # define the embedding model
    embeddings = HuggingFaceEmbeddings(
        model_name="BAAI/bge-large-en-v1.5",
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': True},
    )
    # placeholder for the Qdrant collections
    qdrant_collections = {}
    
    # only the combined 'allreports' collection is embedded here, in-memory
    for file, value in all_documents.items():
        if file == "allreports":
            print("embeddings for:", file)
            qdrant_collections[file] = Qdrant.from_documents(
                value,
                embeddings,
                location=":memory:",
                collection_name=file,
            )
    print(qdrant_collections)
    print("vector embeddings done")
    return qdrant_collections

def get_local_qdrant():
    """Load the locally persisted Qdrant collections from ./data/."""
    qdrant_collections = {}
    # note: this embedding model differs from the one used in process_pdf()
    embeddings = HuggingFaceEmbeddings(
        model_name="BAAI/bge-en-icl",
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': True},
    )
    for val in ['Consolidated', 'District', 'Ministry', 'allreports']:
        client = QdrantClient(path=f"./data/{val}")
        print(client.get_collections())
        qdrant_collections[val] = Qdrant(client=client, collection_name=val, embeddings=embeddings)
    return qdrant_collections
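
# Minimal usage sketch (illustrative, not part of the original pipeline): it
# assumes the persisted Qdrant stores under ./data/ already exist, and the
# query string below is hypothetical.
if __name__ == "__main__":
    collections = get_local_qdrant()
    retriever = collections['allreports'].as_retriever(search_kwargs={"k": 5})
    for hit in retriever.get_relevant_documents("What were the audit findings on payroll?"):
        print(hit.metadata.get("filename"), "-", hit.page_content[:100])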