# KKMS-KSSW-HF / kkms_kssw.py
# Author: Chintan Donda
# Commit e00db83: "Fixing bug, Uploading PDF files"
import os
import utils.constants as constants_utils
import utils.data_loader as data_loader_utils
import utils.langchain_utils as langchain_utils
import utils.weather as weather_utils
import utils.mandi_price as mandi_utils
import utils.translator as translator_utils
from llama_index import GPTSimpleVectorIndex, SimpleDirectoryReader, GPTListIndex
from langchain.indexes import VectorstoreIndexCreator
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import warnings
warnings.filterwarnings('ignore')
class KKMS_KSSW:
    """Kisan Knowledge Management System.

    Wires together the helper utilities (LangChain, mandi prices, weather,
    translation), builds or loads a vector-store index over documents found
    under ``constants_utils.DATA_PATH``, and answers queries against it.
    """

    def __init__(self):
        # Index handle (set by initialize_index) and the documents it was built from.
        self.index = None
        self.documents = []
        # Last query response (set by query()).
        self.response = None

        # Instantiate langchain_utils class object
        self.langchain_utils_obj = langchain_utils.LANGCHAIN_UTILS()
        # Instantiate Mandi Price utils class object
        self.mandi_utils_obj = mandi_utils.MANDI_PRICE()
        # Instantiate Weather class object
        self.weather_utils_obj = weather_utils.WEATHER()
        # Instantiate translator_utils class object
        self.translator_utils_obj = translator_utils.TRANSLATOR()

        # Ensure working directories exist. exist_ok=True avoids the
        # check-then-create race of the previous os.path.exists() guard.
        os.makedirs(constants_utils.DATA_PATH, exist_ok=True)
        os.makedirs(constants_utils.OUTPUT_PATH, exist_ok=True)

    # Initialize index (vector store)
    def initialize_index(self, save_index_to_disk=True, index_type='GPTSimpleVectorIndex'):
        """Load a previously saved index from disk, or build a new one.

        Args:
            save_index_to_disk: persist a newly built index to
                ``constants_utils.INDEX_FILENAME`` (ignored when an existing
                index is loaded).
            index_type: one of ``'GPTSimpleVectorIndex'`` (llama_index) or
                ``'FAISS'`` (LangChain vector store).
        """
        if os.path.exists(constants_utils.INDEX_FILENAME):
            # Load the index from the saved index.json file
            print(f'Loading pre-generated index from: {constants_utils.INDEX_FILENAME}')
            self.index = self.langchain_utils_obj.load_index(
                index_type='GPTSimpleVectorIndex',
                filepath=constants_utils.INDEX_FILENAME
            )
        else:
            # Load data from Docs
            if os.path.exists(constants_utils.DATA_PATH):
                doc_documents = SimpleDirectoryReader(constants_utils.DATA_PATH).load_data()
                self.documents = doc_documents[:]

            # Load data from PDFs only
            # pdf_documents = data_loader_utils.load_document(doc_type='pdf', doc_filepath=doc_filepath)

            # Load data from URLs & append it to the documents that we read from PDFs
            # url_documents = data_loader_utils.load_document(doc_type='url', urls=urls)
            # self.documents.extend(url_documents)

            # Build the Vector store for docs
            if index_type == 'GPTSimpleVectorIndex':
                self.index = GPTSimpleVectorIndex.from_documents(self.documents)
            elif index_type == 'FAISS':
                self.index = FAISS.from_documents(
                    self.documents,
                    OpenAIEmbeddings(openai_api_key=os.getenv('OPENAI_API_KEY'))
                )

            if save_index_to_disk:
                # Save the freshly built index; a loaded index is left untouched.
                print(f'Saving newly generated index: {constants_utils.INDEX_FILENAME}')
                if index_type == 'GPTSimpleVectorIndex':
                    self.index.save_to_disk(constants_utils.INDEX_FILENAME)
                elif index_type == 'FAISS':
                    self.index.save_local(constants_utils.INDEX_FILENAME)

    def merge_documents_from_different_sources(self, doc_documents, url_documents):
        """Merge doc- and URL-sourced documents into one composable index.

        Bug fix: this method referenced ``self.index`` but was declared
        without ``self``; it is now a proper instance method.

        Args:
            doc_documents: documents loaded from local files.
            url_documents: documents loaded from URLs.

        Returns:
            The merged GPTListIndex (also stored on ``self.index``).
        """
        # Build the Vector store for docs
        doc_index = GPTSimpleVectorIndex.from_documents(doc_documents)
        # Build the Vector store for URLs
        url_index = GPTSimpleVectorIndex.from_documents(url_documents)

        # Set summary of each index
        doc_index.set_text("index_from_docs")
        url_index.set_text("index_from_urls")

        # Merge index of different data sources
        self.index = GPTListIndex([doc_index])
        self.index.insert(url_index)  # can also be passed directly as GPTListIndex([doc_index, url_index])
        return self.index

    # Define query on index to retrieve the most relevant top K documents from the vector store
    def query(self,
              question,
              mode='default',
              response_mode="default",
              similarity_top_k=1,
              required_keywords=None,
              exclude_keywords=None,
              verbose=False
              ):
        """Query the index for the most relevant top-K documents.

        Args:
            question: natural-language query string.
            mode: can be any of [default, embedding]
            response_mode: can be any of [default, compact, tree_summarize]
            similarity_top_k: number of top matches to retrieve.
            required_keywords: keywords a match must contain (default: none).
            exclude_keywords: keywords a match must not contain (default: none).
            verbose: pass-through verbosity flag.

        Returns:
            The index's query response (also stored on ``self.response``).
        """
        # Mutable-default bug fix: [] defaults are now None sentinels so the
        # same list object is never shared across calls.
        # Querying the index
        self.response = self.index.query(
            question,
            mode=mode,
            response_mode=response_mode,
            similarity_top_k=similarity_top_k,
            required_keywords=required_keywords if required_keywords is not None else [],
            exclude_keywords=exclude_keywords if exclude_keywords is not None else [],
            verbose=verbose)
        return self.response