## langchain-0.0.129
# ! pip install -U langchain
# !pip install python-dotenv
# !pip install -U chromadb 
import os
# API keys redacted
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import UnstructuredPDFLoader

# Load every document found under the policies folder.
docs = DirectoryLoader('Data/Policies/').load()

# Split the documents into small overlapping chunks (chunk_size=200,
# chunk_overlap=50) so each embedded piece stays focused.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)
all_splits = text_splitter.split_documents(docs)

embedding = OpenAIEmbeddings()
VECTOR_STORE_DIRECTORY = "Vector Store\\"

# Build the Chroma store once, directly from the chunks, and point it at the
# persistence directory. Previously a store bound to VECTOR_STORE_DIRECTORY was
# created and then immediately replaced by one built without it, so the store
# actually queried never used that directory.
# NOTE: despite the "faiss_" prefix these objects are Chroma-backed; the names
# are kept because the QA chain below refers to them.
# NOTE(review): re-running against an already populated directory presumably
# appends the same chunks again -- confirm, and clear the directory first if
# duplicates are not wanted.
faiss_vectorstore = Chroma.from_documents(
    all_splits,
    embedding,
    persist_directory=VECTOR_STORE_DIRECTORY,
)

# Retriever configured with k=2, i.e. two chunks are requested per query.
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": 2})

from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
# Prompt text for the policy question-answering chain.
# It carries two placeholders: {context} and {question}. The instructions ask
# the model to answer only from the supplied text, to fall back to a fixed
# "contact your HRBP" reply when no answer is found, to avoid pointing at
# annexes, to prefer bullet points, and to stay within three sentences.
# The string is sent to the model as-is, so its wording and whitespace are
# part of the program's behaviour.
template = """
You are an Expert Policy Advisor.These Below are the Documents that are extracted from the different Policies.Your Job 
 is to Provide the Answer to below question based on the text below. 
 Here are few instructions for you to follow when answering a question.
 - When you didnt find the relevant answers from below text Just Say "I dont know this,Please contact your HRBP for more details."
 - These are policy Documents, When answering a question Do Not return in response that "This information is At Annex A/B".Provide a Complete response to request.
 - Try to answer the questions in bullet format if possible.
 - Use three sentences maximum to Answer the question in very concise manner
 
 
 {context}
 Question: {question}
 Helpful Answer:
 """
# Wrap the raw template in a PromptTemplate; it is handed to the QA chain
# below under the "prompt" key.
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# Chat model used to write the final answer; temperature is pinned to 0.
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")

# Retrieval QA chain: chunks come from the retriever built above and are
# rendered through the custom policy-advisor prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type_kwargs=dict(prompt=QA_CHAIN_PROMPT),
    retriever=faiss_retriever,
)

# Smoke-test query (typo left as originally written).
qa_chain.run("what is leaves plicy?")

### Testing Different Document Loaders/ Splitters

#### Markdown Splitter
- This will split the documents based on their Headings. 
 - I think this will be a better approach for our use case, as the policies are divided into separate chunks based on their headings. I think tables will also be handled in this case.

from langchain.document_loaders import PDFMinerLoader
from langchain.document_loaders import PDFMinerPDFasHTMLLoader

# Convert a single policy PDF to HTML so its headings can be inspected.
loader = PDFMinerPDFasHTMLLoader("Data\\Policies\\2.16 Role Based Entitlements Policy V7 22.pdf")

# Load once and keep the first returned document. The PDF used to be loaded
# twice, with the first result thrown away.
data = loader.load()[0]
