import src.constants as constants_utils
import src.data_loader as data_loader_utils
import src.utils as utils

from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import chromadb
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.prompts import PromptTemplate
from llama_index import GPTSimpleVectorIndex, GPTListIndex
from langchain.vectorstores import FAISS

import pickle
import shutil
from typing import Dict, List, Optional

import os
# Both API keys are expected to already be set in the environment; re-assigning an unset
# variable here would raise a TypeError since os.getenv() returns None in that case.
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_API_KEY')
os.environ['HUGGINGFACEHUB_API_TOKEN'] = os.getenv('HUGGINGFACEHUB_API_TOKEN')

import logging
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s %(levelname)s [%(name)s] %(message)s", level=logging.INFO, datefmt="%Y-%m-%d %H:%M:%S"
)

import warnings
warnings.filterwarnings('ignore')



class LANGCHAIN_UTILS:
    def __init__(self,
        index_type=constants_utils.INDEX_TYPE,
        load_from_existing_index_store=constants_utils.LOAD_FROM_EXISTING_INDEX_STORE
    ):
        self.index_type = index_type
        self.load_from_existing_index_store = load_from_existing_index_store
        
        # Temporary index in the current context for the doc_type in consideration
        self.index = None
        # Master index which contains data from multiple sources (PDF, online PDF, text files, URLs, etc.).
        # It gets updated on demand when data from new files/URLs is uploaded, without any downtime of the application.
        self.master_index = None
        
        # Data source wise index
        self.index_category_doc_type_wise_index = {
            ic: {ds: None for ds in constants_utils.DATA_SOURCES.values()}
            for ic in constants_utils.INDEX_CATEGORY
        }
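        # Illustrative shape of the mapping above (the actual keys come from constants_utils), e.g.:
        # {'crops': {'pdf': None, 'textfile': None, ...}, 'pest_management': {'pdf': None, ...}, ...}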
        
        # Data loaded as a Document format in the current context for the doc_type in consideration
        self.documents = []

        # Instantiate data_loader_utils class object
        self.data_loader_utils_obj = data_loader_utils.DATA_LOADER()
        # Instantiate UTILS class object
        self.utils_obj = utils.UTILS()

        # Initialize embeddings (we can also use other embeddings)
        self.embeddings = OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'))


    def generate_prompt_template(
        self,
        prompt_type='general'
    ):
        prompt_template = ''

        if prompt_type == 'general':
            prompt_template = """Write a concise summary of the following:

            {text}

            SUMMARIZE IN ENGLISH:"""

        elif prompt_type == 'weather':
            prompt_template = """
                What would be the weather based on the below data:
                {text}
            """

        return prompt_template



    def get_textual_summary(
        self,
        text,
        chain_type="stuff",
        custom_prompt=True,
        prompt_type='general'
    ):
        texts = [text]
        docs = [Document(page_content=t) for t in texts[:3]]

        llm = OpenAI(temperature=0)
        if custom_prompt:
            prompt_template = self.generate_prompt_template(prompt_type)
            PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
            chain = load_summarize_chain(llm, chain_type=chain_type, prompt=PROMPT)
        else:
            chain = load_summarize_chain(llm, chain_type=chain_type)

        text_summary = chain.run(docs)
        return text_summary


    def get_weather_forecast_summary(
        self,
        text,
        chain_type="stuff"
    ):
        text = f"""
            What would be the weather based on the below data:
            {text}

            Give a simple response, without technical numbers, that can be explained to a human.
        """
        texts = [text]
        docs = [Document(page_content=t) for t in texts[:3]]

        llm = OpenAI(temperature=0)
        chain = load_summarize_chain(llm, chain_type=chain_type)
        text_summary = chain.run(docs)

        return text_summary


    def get_answer_from_para(
        self,
        para,
        question,
        chain_type="stuff",
        custom_prompt=True
    ):
        # Prepare data (Split paragraph into chunks of small documents)
        text_splitter = CharacterTextSplitter(chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE, chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP)
        texts = text_splitter.split_text(para)

        if self.index_type == 'FAISS':
            # Find similar docs that are relevant to the question
            docsearch = FAISS.from_texts(
                texts, self.embeddings,
                metadatas=[{"source": str(i)} for i in range(len(texts))]
            )

        elif self.index_type == 'Chroma':
            # Find similar docs that are relevant to the question
            docsearch = Chroma.from_texts(
                texts, self.embeddings,
                metadatas=[{"source": str(i)} for i in range(len(texts))]
            )

        # Search for the similar docs
        docs = docsearch.similarity_search(question, k=1)

        llm = OpenAI(temperature=0)
        # Create a Chain for question answering
        if custom_prompt:
            prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

            {context}

            Question: {question}
            Answer in English:"""

            PROMPT = PromptTemplate(
                template=prompt_template, input_variables=["context", "question"]
            )
            chain = load_qa_chain(llm, chain_type=chain_type, prompt=PROMPT)
        else:
            # chain = load_qa_with_sources_chain(llm, chain_type=chain_type)
            chain = load_qa_chain(llm, chain_type=chain_type)
            # chain.run(input_documents=docs, question=question)

        out_dict = chain({"input_documents": docs, "question": question}, return_only_outputs=True)
        return out_dict['output_text']


    def load_documents(
        self,
        doc_type,
        doc_filepath='',
        urls=[]
    ):
        """
            Load data of the given doc_type into Document format, from either doc_filepath or a list of urls.
            It can load multiple files/urls in one shot.

            Args:
                doc_type: can be any of [pdf, online_pdf, urls, textfile, directory]
                doc_filepath: can be a directory or a filepath
                urls: list of urls
        """

        logger.info(f'Loading {doc_type} data into Documents format')

        if doc_type == 'pdf':
            # Load data from PDFs stored in local directory
            self.documents.extend(
                self.data_loader_utils_obj.load_documents_from_pdf(
                    doc_filepath=doc_filepath,
                    doc_type=doc_type
                ))

        elif doc_type == 'online_pdf':
            # Load data from online PDFs via their URLs
            self.documents.extend(
                self.data_loader_utils_obj.load_documents_from_pdf(
                    urls=urls,
                    doc_type=doc_type
                ))

        elif doc_type == 'urls':
            # Load data from URLs
            self.documents.extend(
                self.data_loader_utils_obj.load_documents_from_urls(
                    urls=urls,
                    doc_type=doc_type
                ))

        elif doc_type == 'textfile':
            # Load data from text files & Convert texts into Document format
            self.documents.extend(
                self.convert_text_to_documents(
                    self.data_loader_utils_obj.load_documents_from_text(
                    doc_filepath=doc_filepath,
                    doc_type=doc_type
                )
            ))

        elif doc_type == 'directory':
            # Load data from local directory
            self.documents.extend(
                self.data_loader_utils_obj.load_documents_from_directory(
                    doc_filepath=doc_filepath,
                    doc_type=doc_type
                ))

        logger.info(f'{doc_type} data into Documents format loaded successfully!')


    def create_index(
        self
    ):
        logger.info(f'Creating index')

        if not self.documents:
            logger.warning(f'Empty documents. Index cannot be created!')
            return None

        ############## Build the Vector store for docs ##############
        # Vector store using Facebook AI Similarity Search
        if self.index_type == 'FAISS':
            text_splitter = CharacterTextSplitter(
                chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
                chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP,
            )
            self.documents = text_splitter.split_documents(self.documents)

            self.index = FAISS.from_documents(
                self.documents,
                self.embeddings
            )

        # Vector store using Chroma DB
        elif self.index_type == 'Chroma':
            if not os.path.exists(self.index_filepath):
                os.makedirs(self.index_filepath)

            text_splitter = CharacterTextSplitter(
                chunk_size=constants_utils.TEXT_SPLITTER_CHUNK_SIZE,
                chunk_overlap=constants_utils.TEXT_SPLITTER_CHUNK_OVERLAP
            )
            self.documents = text_splitter.split_documents(self.documents)
            self.index = Chroma.from_documents(
                self.documents,
                self.embeddings,
                persist_directory=self.index_filepath
            )

        # Vector store using GPT vector index
        elif self.index_type == 'GPTSimpleVectorIndex':
            self.index = GPTSimpleVectorIndex.from_documents(self.documents)

        logger.info(f'Index created successfully!')
        return self.index


    def get_index_filepath(
        self,
        index_category,
        doc_type
    ):
        # FAISS/Chroma indices persist to a directory; the other index types persist to a .json file
        if doc_type == 'master':
            if self.index_type in ['FAISS', 'Chroma']:
                self.index_filepath = os.path.join(constants_utils.OUTPUT_PATH, f'index_{index_category}')
            else:
                self.index_filepath = os.path.join(constants_utils.OUTPUT_PATH, f'index_{index_category}.json')
        else:
            if self.index_type in ['FAISS', 'Chroma']:
                self.index_filepath = os.path.join(constants_utils.OUTPUT_PATH, f'index_{index_category}', f'index_{doc_type}')
            else:
                self.index_filepath = os.path.join(constants_utils.OUTPUT_PATH, f'index_{index_category}', f'index_{doc_type}.json')

        return self.index_filepath
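
    # Example on-disk layouts produced by get_index_filepath (illustrative; the category,
    # doc_type and OUTPUT_PATH values come from constants_utils):
    #   FAISS/Chroma master index:           <OUTPUT_PATH>/index_crops
    #   FAISS/Chroma doc_type index:         <OUTPUT_PATH>/index_crops/index_pdf
    #   GPTSimpleVectorIndex/pickle index:   <OUTPUT_PATH>/index_crops/index_pdf.json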


    def load_master_doctype_indices_for_index_category(
        self,
        index_category
    ):
        logger.info(f'Loading master and doc_type indices for: {index_category}')

        # Set master index of index_category = None
        self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = None

        for doc_type in self.index_category_doc_type_wise_index[index_category].keys():
            self.index = None
            self.index_filepath = self.get_index_filepath(
                index_category=index_category,
                doc_type=doc_type
            )
            self.load_index()
            # Set master/doc_type index
            self.index_category_doc_type_wise_index[index_category][doc_type] = self.index

        logger.info(f'Master and doc_type indices for: {index_category} loaded successfully!')


    def load_create_index(
        self
    ):
        logger.info(f'Loading/Creating index for each index_category')

        for index_category in constants_utils.INDEX_CATEGORY:
            # Load master index_category index if self.load_from_existing_index_store == True
            if self.load_from_existing_index_store:
                self.load_master_doctype_indices_for_index_category(index_category)

            # If, for any reason, the master index is not loaded, then create a new index/vector store
            if not self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE]:
                logger.info(f'Creating a new Vector/Index store for: {index_category}')

                doc_filepath = os.path.join(constants_utils.DATA_PATH, index_category)
                urls = []

                # Build the Vector/Index store
                for doc_type in list(constants_utils.DATA_SOURCES.values()):
                    logger.info(f'Creating a new Vector/Index store for: {index_category} from data source: {doc_type}')

                    index = None
                    if doc_type in ['pdf', 'textfile']:
                        index = self.create_store_index(
                            doc_type=doc_type,
                            doc_filepath=doc_filepath,
                            index_category=index_category
                        )
                    else:
                        # Build the Vector/Index store from web urls
                        index = self.create_store_index(
                            doc_type=doc_type,
                            urls=urls,
                            index_category=index_category
                        )

                    if index:
                        self.index_category_doc_type_wise_index[index_category][doc_type] = index

                    logger.info(f'New Vector/Index store for: {index_category} from data source: {doc_type} created successfully!')

                logger.info(f'New Vector/Index store for: {index_category} created successfully!')

                # Merge index of each doc_type into a single index_category
                self.merge_store_master_index(
                    index_category=index_category
                )

        logger.info(f'Index for each index_category loaded successfully!')


    def create_store_index(
        self,
        doc_type='pdf',
        doc_filepath=constants_utils.DATA_PATH,
        urls=[],
        index_category=constants_utils.INDEX_CATEGORY[0]
    ):
        logger.info(f'Creating and storing {doc_type} index')

        self.documents = []
        self.index = None

        self.index_filepath = self.get_index_filepath(
            index_category=index_category,
            doc_type=doc_type
        )

        # Delete the old index file
        shutil.rmtree(self.index_filepath, ignore_errors=True)
        logger.info(f'{self.index_filepath} deleted.')
    
        # Load data in Documents format that can be consumed for index creation
        self.load_documents(
            doc_type,
            doc_filepath,
            urls
        )
        
        # Create the index from documents for search/retrieval
        self.index = self.create_index()

        # Store index
        self.store_index(
            index=self.index,
            index_filepath=self.index_filepath
        )

        logger.info(f'{doc_type} index created and stored successfully!')
        # Return the index of the given doc_type (an index for a single doc_type). Indices from multiple
        # doc_types are merged later into the master index so that queries can be made against a single index.
        return self.index


    def store_index(
        self,
        index,
        index_filepath
    ):
        logger.info(f'Saving index to: {index_filepath}')

        if not index:
            logger.warning(f'Cannot write an empty index to: {index_filepath}!')
            return

        # index_filepath is a directory for FAISS/Chroma, but a file path for the other index types;
        # create only the directory that should hold the index
        dir_to_create = index_filepath if self.index_type in ['FAISS', 'Chroma'] else os.path.dirname(index_filepath)
        if dir_to_create and not os.path.exists(dir_to_create):
            os.makedirs(dir_to_create)

        if self.index_type == 'FAISS':
            index.save_local(index_filepath)

        elif self.index_type == 'Chroma':
            index.persist()

        elif self.index_type == 'GPTSimpleVectorIndex':
            index.save_to_disk(index_filepath)

        elif self.index_type == 'pickle':
            with open(index_filepath, "wb") as f:
                pickle.dump(index, f)

        logger.info(f'Index saved to: {index_filepath} successfully!')


    def load_index(
        self
    ):
        logger.info(f'Loading index from: {self.index_filepath}')

        if not os.path.exists(self.index_filepath):
            logger.warning(f"Cannot load index from {self.index_filepath} as the path does not exist!")
            return
        
        if self.index_type == 'FAISS':
            self.index = FAISS.load_local(self.index_filepath, self.embeddings)

        elif self.index_type == 'Chroma':
            self.index = Chroma(
                persist_directory=self.index_filepath,
                embedding_function=self.embeddings
            )

        elif self.index_type == 'GPTSimpleVectorIndex':
            self.index = GPTSimpleVectorIndex.load_from_disk(self.index_filepath)

        elif self.index_type == 'pickle':
            with open(self.index_filepath, "rb") as f:
                self.index = pickle.load(f)

        logger.info(f'Index loaded from: {self.index_filepath} successfully!')


    def convert_text_to_documents(
        self,
        text_list=[]
    ):
        """
            Converts a list of texts into the Document format that can be fed to the GPT API to build the Vector store
        """

        # Imported locally so that it does not shadow langchain's Document class imported at module level
        from llama_index import Document
        documents = [Document(t) for t in text_list]
        return documents


    def merge_documents_from_different_sources(
        self,
        doc_documents,
        url_documents
    ):
        # Build the Vector store for docs
        doc_index = GPTSimpleVectorIndex.from_documents(doc_documents)
        # Build the Vector store for URLs
        url_index = GPTSimpleVectorIndex.from_documents(url_documents)

        # Set summary of each index
        doc_index.set_text("index_from_docs")
        url_index.set_text("index_from_urls")

        # Merge index of different data sources
        index = GPTListIndex([doc_index, url_index])

        return index


    def merge_store_master_index(
        self,
        index_category
    ):
        """
        Merge multiple doc_type indices into a single master index. Query/search would be performed on this merged index.

        Args:
            index_category: the index category whose doc_type indices should be merged (can be any of: [crops, fruits, pest_management, govt_policy, soil, etc.])
        """
        logger.info(f'Merging doc_type indices of index category: {index_category} into a master index')

        self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = None
        doc_type_indices = self.index_category_doc_type_wise_index[index_category]

        if self.index_type == 'FAISS':
            for doc_type, index in doc_type_indices.items():
                if doc_type == constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE:
                    # Only merge the non-master doc_type_indices
                    continue
                if not index or not isinstance(index, FAISS):
                    logger.warning(f'{doc_type} index to be merged is not an instance of type langchain.vectorstores.faiss.FAISS')
                    continue
                if not self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE]:
                    self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = index
                else:
                    self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE].merge_from(index)

        elif self.index_type == 'Chroma':
            for doc_type, index in doc_type_indices.items():
                if not index or not isinstance(index, Chroma):
                    logger.warning(f'{doc_type} index to be merged is not an instance of type langchain.vectorstores.Chroma')
                    continue
                raise NotImplementedError

        elif self.index_type == 'GPTSimpleVectorIndex':
            for doc_type, index in doc_type_indices.items():
                if not index or not isinstance(index, GPTSimpleVectorIndex):
                    logger.warning(f'{doc_type} index to be merged is not an instance of type llama_index.GPTSimpleVectorIndex')
                    continue
                raise NotImplementedError

        # Store index_category master index
        self.store_index(
            index=self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE],
            index_filepath=self.get_index_filepath(
                index_category=index_category,
                doc_type=constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE
            )
        )

        logger.info(f'doc_type indices of index category: {index_category} merged into a master index successfully!')


    def init_chromadb(self):
        logger.info('Initializing Chroma DB')

        if not os.path.exists(self.index_filepath):
            os.makedirs(self.index_filepath)

        client_settings = chromadb.config.Settings(
            chroma_db_impl="duckdb+parquet",
            persist_directory=self.index_filepath,
            anonymized_telemetry=False
        )

        self.index = Chroma(
            collection_name="langchain_store",
            embedding_function=self.embeddings,
            client_settings=client_settings,
            persist_directory=self.index_filepath,
        )

        logger.info('Chroma DB initialized successfully!')


    def query_chromadb(self, question, k=1):
        return self.index.similarity_search(query=question, k=k)


    def query(self,
        question,
        question_category,
        mode='embedding',
        response_mode="default",
        similarity_top_k=2,
        required_keywords=[],
        exclude_keywords=[],
        verbose=False
    ):
        '''
            Query the master index of the given question_category.

            Args:
                mode: can be any of [default, embedding] (used only with GPTSimpleVectorIndex)
                response_mode: can be any of [default, compact, tree_summarize] (used only with GPTSimpleVectorIndex)

            Returns:
                A list of similar documents for FAISS/Chroma, or a query response object for GPTSimpleVectorIndex.
        '''
        logger.info(f'question category: {question_category}; question: {question}')

        response = None

        # Get the index of the given question_category
        index = self.index_category_doc_type_wise_index[question_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE]

        if self.index_type in ['FAISS', 'Chroma']:
            # Both FAISS and Chroma vector stores expose the same similarity_search interface
            response = index.similarity_search(
                question,
                k=similarity_top_k
            )

        elif self.index_type == 'GPTSimpleVectorIndex':
            # Querying the index
            response = index.query(
                question,
                mode=mode,
                response_mode=response_mode,
                similarity_top_k=similarity_top_k,
                required_keywords=required_keywords,
                exclude_keywords=exclude_keywords,
                verbose=verbose
            )

        return response


    def load_uploaded_documents(
        self,
        doc_type,
        files_or_urls
    ):
        logger.info(f'Loading uploaded documents from: {doc_type}')

        if doc_type == 'pdf':
            if not isinstance(files_or_urls, list):
                files_or_urls = [files_or_urls]
            for pdf in files_or_urls:
                if not pdf.name.endswith('.pdf'):
                    logger.warning(f'Found a file that is not in .pdf format. Cannot load the file {pdf.name}!')
                    continue
                logger.info(f'Loading PDF from: {pdf.name}')
                # Load PDF as documents
                self.documents.extend(
                    self.data_loader_utils_obj.load_documents_from_pdf(
                        doc_filepath=pdf.name,
                        doc_type=doc_type
                    )
                )

        elif doc_type == 'textfile':
            if not isinstance(files_or_urls, list):
                files_or_urls = [files_or_urls]
            for text_file in files_or_urls:
                if not text_file.name.endswith('.txt'):
                    logger.warning(f'Found a file that is not in .txt format. Cannot load the file {text_file.name}!')
                    continue
                logger.info(f'Loading textfile from: {text_file.name}')
                # Load textfile as documents
                self.documents.extend(
                    self.data_loader_utils_obj.load_documents_from_text(
                        doc_filepath=text_file.name,
                        doc_type=doc_type
                    )
                )

        elif doc_type == 'online_pdf':
            files_or_urls = self.utils_obj.split_text(files_or_urls)
            # Load online_pdfs as documents
            self.documents.extend(
                self.data_loader_utils_obj.load_documents_from_pdf(
                    doc_type=doc_type,
                    urls=files_or_urls
                )
            )

        elif doc_type == 'urls':
            files_or_urls = self.utils_obj.split_text(files_or_urls)
            # Load URLs as documents
            self.documents.extend(
                self.data_loader_utils_obj.load_documents_from_urls(
                    doc_type=doc_type,
                    urls=files_or_urls
                )
            )

        logger.info(f'Uploaded documents from: {doc_type} loaded successfully!')


    def upload_data(
        self,
        doc_type,
        files_or_urls,
        index_category
    ):
        logger.info(f'Uploading data for: {index_category}-{doc_type}')

        self.documents = []
        self.index = None

        # Create documents of the uploaded files
        self.load_uploaded_documents(
            doc_type,
            files_or_urls
        )
        
        # Create the index from documents for search/retrieval
        self.index = self.create_index()

        # Update the existing index with the newly uploaded data
        self.upsert_index(
            doc_type=doc_type,
            index_category=index_category
        )

        logger.info(f'{index_category}-{doc_type} data uploaded successfully!')


    def upsert_index(
        self,
        doc_type,
        index_category
    ):
        """
            Updates the index of the given index_category-doc_type, if present.
            Creates a new index if index_category-doc_type index is not present.
            Also updates the master index for the given index_category.
        """
        logger.info(f'Upserting index for: {index_category}-{doc_type}')

        if not self.index_category_doc_type_wise_index.get(index_category, None):
            """
                If the index_category index does not exist
                Steps:
                    - set index_category index
                    - set doc_type index
                    - Store new index_category index as master
                    - Store new doc_type index
            """
            logger.info(f'Master index does not exist for: {index_category}. A new {index_category} master index & {doc_type} index would be created.')
            self.index_category_doc_type_wise_index.setdefault(index_category, {})
            # Set a master index only if it doesn't exist. Otherwise keep its value as it is.
            self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = self.index
            # Set an index for the given doc_type only if it doesn't exist. Otherwise keep its value as it is.
            self.index_category_doc_type_wise_index[index_category][doc_type] = self.index
        
        elif not self.index_category_doc_type_wise_index[index_category].get(doc_type, None):
            """
                If the doc_type index does not exist
                Steps:
                    - set doc_type index
                    - if master index does not exist for the index_category - set a master index
                    - if master index exists - update the master index to merge it with doc_type index
                    - Store new/updated index_category index as master
                    - Store new doc_type index
            """
            logger.info(f'{doc_type} index does not exist for: {index_category}-{doc_type}. A new {doc_type} index would be created.')
            # create doc_type index
            self.index_category_doc_type_wise_index[index_category][doc_type] = self.index
            # if master index does not exist for the index_category - create a master index
            if not self.index_category_doc_type_wise_index[index_category].get(constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE, None):
                logger.info(f'Master index does not exist for: {index_category}-{doc_type}. A new master index would be created.')
                self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = self.index

        else:
            """
                If the new document is of the existing index_category & doc_type
                Steps:
                    - if master index does not exist for the index_category - set a master index
                    - if master index exists - update the master index to merge it with doc_type index
                    - update the doc_type index
                    - Store updated index_category index as master
                    - Store updated doc_type index
            """
            # if master index does not exist for the index_category - create a master index
            if not self.index_category_doc_type_wise_index[index_category].get(constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE, None):
                logger.info(f'Master index does not exist for: {index_category}-{doc_type}. A new master index would be created.')
                self.index_category_doc_type_wise_index[index_category][constants_utils.INDEX_CATEGORY_MASTER_INDEX_DOC_TYPE] = self.index
            # Merge new self.index with existing doc_type index
            self.index_category_doc_type_wise_index[index_category][doc_type].merge_from(self.index)
            # Update self.index to store/overwrite the existing index with the updated index
            self.index = self.index_category_doc_type_wise_index[index_category][doc_type]
        

        # Store newly created/merged index
        self.store_index(
            index=self.index,
            index_filepath=self.get_index_filepath(
                index_category=index_category,
                doc_type=doc_type
            )
        )

        # Merge and store master index for index_category
        self.merge_store_master_index(
            index_category=index_category
        )

        logger.info(f'Index for: {index_category}-{doc_type} upserted successfully!')


    def delete_index(
        self,
        ids: Optional[List[str]] = None,
        # filter: Optional[DocumentMetadataFilter] = None,
        delete_all: Optional[bool] = None,
    ):
        """
            Removes vectors by ids, filter, or everything in the datastore.
            Multiple parameters can be used at once.
            Returns whether the operation was successful.
        """
        logger.info('Deleting index')

        raise NotImplementedError

        # Unreachable reference code for a future implementation:
        # with a Chroma index, a specific collection can be deleted and the change persisted.
        # self.index.delete_collection()
        # self.index.persist()

        # Or remove the persist directory altogether:
        # shutil.rmtree(self.index_filepath, ignore_errors=True)
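

if __name__ == '__main__':
    # Illustrative usage sketch (assumptions: the constants in src/constants.py are configured,
    # the OPENAI_API_KEY environment variable is set, and source data exists under
    # constants_utils.DATA_PATH; the question below is only an example).
    langchain_utils_obj = LANGCHAIN_UTILS(
        index_type=constants_utils.INDEX_TYPE,
        load_from_existing_index_store=constants_utils.LOAD_FROM_EXISTING_INDEX_STORE
    )

    # Load existing indices (or build new ones) for every index category, then merge
    # each category's doc_type indices into its master index
    langchain_utils_obj.load_create_index()

    # Query the master index of one index category
    response = langchain_utils_obj.query(
        question='How can pests be controlled in paddy crops?',
        question_category=constants_utils.INDEX_CATEGORY[0]
    )
    print(response)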