# KKMS-KSSW-HF / kkms_kssw.py
# Author: Chintan Donda
# Commit e00db83: "Fixing bug, Uploading PDF files"
import os
import utils.constants as constants_utils
import utils.data_loader as data_loader_utils
import utils.langchain_utils as langchain_utils
import utils.weather as weather_utils
import utils.mandi_price as mandi_utils
import utils.translator as translator_utils
from llama_index import GPTSimpleVectorIndex, SimpleDirectoryReader, GPTListIndex
from langchain.indexes import VectorstoreIndexCreator
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import warnings
warnings.filterwarnings('ignore')
class KKMS_KSSW:
    """Kisan Knowledge Management System.

    Wires together the helper utilities (LangChain, mandi prices, weather,
    translation), builds or loads a vector-store index over documents found
    under ``constants_utils.DATA_PATH``, and answers queries against it.
    """

    def __init__(self):
        # Index handle (set by initialize_index) and the documents it was built from.
        self.index = None
        self.documents = []
        # Last query response (set by query()).
        self.response = None

        # Instantiate langchain_utils class object
        self.langchain_utils_obj = langchain_utils.LANGCHAIN_UTILS()
        # Instantiate Mandi Price utils class object
        self.mandi_utils_obj = mandi_utils.MANDI_PRICE()
        # Instantiate Weather class object
        self.weather_utils_obj = weather_utils.WEATHER()
        # Instantiate translator_utils class object
        self.translator_utils_obj = translator_utils.TRANSLATOR()

        # Ensure working directories exist. exist_ok=True avoids the
        # check-then-create race of the previous os.path.exists() guard.
        os.makedirs(constants_utils.DATA_PATH, exist_ok=True)
        os.makedirs(constants_utils.OUTPUT_PATH, exist_ok=True)

    # Initialize index (vector store)
    def initialize_index(self, save_index_to_disk=True, index_type='GPTSimpleVectorIndex'):
        """Load a previously saved index from disk, or build a new one.

        Args:
            save_index_to_disk: persist a newly built index to
                ``constants_utils.INDEX_FILENAME`` (ignored when an existing
                index is loaded).
            index_type: one of ``'GPTSimpleVectorIndex'`` (llama_index) or
                ``'FAISS'`` (LangChain vector store).
        """
        if os.path.exists(constants_utils.INDEX_FILENAME):
            # Load the index from the saved index.json file
            print(f'Loading pre-generated index from: {constants_utils.INDEX_FILENAME}')
            self.index = self.langchain_utils_obj.load_index(
                index_type='GPTSimpleVectorIndex',
                filepath=constants_utils.INDEX_FILENAME
            )
        else:
            # Load data from Docs
            if os.path.exists(constants_utils.DATA_PATH):
                doc_documents = SimpleDirectoryReader(constants_utils.DATA_PATH).load_data()
                self.documents = doc_documents[:]

            # Load data from PDFs only
            # pdf_documents = data_loader_utils.load_document(doc_type='pdf', doc_filepath=doc_filepath)

            # Load data from URLs & append it to the documents that we read from PDFs
            # url_documents = data_loader_utils.load_document(doc_type='url', urls=urls)
            # self.documents.extend(url_documents)

            # Build the Vector store for docs
            if index_type == 'GPTSimpleVectorIndex':
                self.index = GPTSimpleVectorIndex.from_documents(self.documents)
            elif index_type == 'FAISS':
                self.index = FAISS.from_documents(
                    self.documents,
                    OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'))
                )

            if save_index_to_disk:
                # Save the freshly built index; a loaded index is left untouched.
                print(f'Saving newly generated index: {constants_utils.INDEX_FILENAME}')
                if index_type == 'GPTSimpleVectorIndex':
                    self.index.save_to_disk(constants_utils.INDEX_FILENAME)
                elif index_type == 'FAISS':
                    self.index.save_local(constants_utils.INDEX_FILENAME)

    def merge_documents_from_different_sources(self, doc_documents, url_documents):
        """Merge doc- and URL-sourced documents into one composable index.

        Bug fix: this method referenced ``self.index`` but was declared
        without ``self``; it is now a proper instance method.

        Args:
            doc_documents: documents loaded from local files.
            url_documents: documents loaded from URLs.

        Returns:
            The merged GPTListIndex (also stored on ``self.index``).
        """
        # Build the Vector store for docs
        doc_index = GPTSimpleVectorIndex.from_documents(doc_documents)
        # Build the Vector store for URLs
        url_index = GPTSimpleVectorIndex.from_documents(url_documents)

        # Set summary of each index
        doc_index.set_text("index_from_docs")
        url_index.set_text("index_from_urls")

        # Merge index of different data sources
        self.index = GPTListIndex([doc_index])
        self.index.insert(url_index)  # can also be passed directly as GPTListIndex([doc_index, url_index])
        return self.index

    # Define query on index to retrieve the most relevant top K documents from the vector store
    def query(self,
              question,
              mode='default',
              response_mode="default",
              similarity_top_k=1,
              required_keywords=None,
              exclude_keywords=None,
              verbose=False
              ):
        """Query the index for the most relevant top-K documents.

        Args:
            question: natural-language query string.
            mode: can be any of [default, embedding]
            response_mode: can be any of [default, compact, tree_summarize]
            similarity_top_k: number of top matches to retrieve.
            required_keywords: keywords a match must contain (default: none).
            exclude_keywords: keywords a match must not contain (default: none).
            verbose: pass-through verbosity flag.

        Returns:
            The index's query response (also stored on ``self.response``).
        """
        # Mutable-default bug fix: [] defaults are now None sentinels so the
        # same list object is never shared across calls.
        # Querying the index
        self.response = self.index.query(
            question,
            mode=mode,
            response_mode=response_mode,
            similarity_top_k=similarity_top_k,
            required_keywords=required_keywords if required_keywords is not None else [],
            exclude_keywords=exclude_keywords if exclude_keywords is not None else [],
            verbose=verbose)
        return self.response