{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "## langchain-0.0.129\n", "# ! pip install -U langchain" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: python-dotenv in d:\\anaconda3\\envs\\nlp\\lib\\site-packages (1.0.0)\n" ] } ], "source": [ "# !pip install python-dotenv" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# !pip install -U chromadb " ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "# SECURITY: never hardcode API keys in a notebook -- the inline keys that used to live\n", "# here were removed and must be revoked. Load the key from a local .env file instead\n", "# (python-dotenv is installed in a cell above).\n", "from dotenv import load_dotenv\n", "load_dotenv()\n", "assert os.getenv(\"OPENAI_API_KEY\"), \"Set OPENAI_API_KEY in your environment or a .env file\"" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from langchain.indexes import VectorstoreIndexCreator\n", "from langchain.vectorstores import Chroma\n", "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", "from langchain.document_loaders import DirectoryLoader\n", "from langchain.embeddings import OpenAIEmbeddings\n", "from langchain.document_loaders import UnstructuredPDFLoader\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "docs = DirectoryLoader('Data/Policies/').load()\n", "text_splitter = RecursiveCharacterTextSplitter(chunk_size = 200, chunk_overlap = 50)\n", "all_splits = text_splitter.split_documents(docs)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# bm25_retriever = BM25Retriever.from_documents(all_splits)\n", "\n", "# # initialize the bm25 retriever and faiss retriever\n", "# # bm25_retriever = BM25Retriever.from_texts(doc_list)\n", "# # bm25_retriever = BM25Retriever.from_documents(docs)\n", "# bm25_retriever.k = 2\n", "\n", "embedding = 
OpenAIEmbeddings()\n", "VECTOR_STORE_DIRECTORY = \"Vector Store\\\\\"\n", "# NOTE: the \"faiss_\" prefix is left over from an earlier FAISS experiment -- this is a Chroma store.\n", "# Build the index from the splits and persist it in one call; the extra\n", "# Chroma(persist_directory=...) client that used to sit here was dead code\n", "# (it was immediately overwritten, so the persist directory was never used).\n", "faiss_vectorstore = Chroma.from_documents(all_splits, embedding, persist_directory=VECTOR_STORE_DIRECTORY)\n", "faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={\"k\": 2})\n", "\n", "# initialize the ensemble retriever\n", "# ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, faiss_retriever], weights=[0.5, 0.5])\n", "\n", "from langchain.chains import RetrievalQA\n", "from langchain.prompts import PromptTemplate\n", "from langchain.chat_models import ChatOpenAI\n", "template = \"\"\"\n", "You are an expert policy advisor. Below are documents extracted from different policies. Your job\n", " is to answer the question below based only on the text provided.\n", " Follow these instructions when answering a question:\n", " - If you cannot find a relevant answer in the text below, just say \"I don't know this. Please contact your HRBP for more details.\"\n", " - These are policy documents; do not reply that \"This information is at Annex A/B\". Provide a complete response to the request.\n", " - Try to answer the question in bullet format if possible.\n", " - Use three sentences maximum and answer in a very concise manner.\n", " \n", " \n", " {context}\n", " Question: {question}\n", " Helpful Answer:\n", " \"\"\"\n", "QA_CHAIN_PROMPT = PromptTemplate.from_template(template)\n", "\n", "llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0)\n", "\n", "qa_chain = RetrievalQA.from_chain_type(\n", " llm,\n", " retriever=faiss_retriever,\n", " chain_type_kwargs={\"prompt\": QA_CHAIN_PROMPT}\n", ")\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'- The leave policy allows employees to take time off from work for various reasons such as vacation, 
personal illness, or family emergencies.\\n- Employees are typically granted a certain number of paid leave days per year, which may vary based on their length of service or job level.\\n- The policy outlines the process for requesting and approving leave, as well as any restrictions or requirements for taking leave.'" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "qa_chain.run(\"What is the leave policy?\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Testing Different Document Loaders / Splitters" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Markdown Splitter\n", "- This will split the documents based on their headings. \n", " - I think this will be a better approach for our use case, as the policies are divided into separate chunks based on their headings. I think tables will also be catered for in this case as well." ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "from langchain.document_loaders import PDFMinerLoader\n", "from langchain.document_loaders import PDFMinerPDFasHTMLLoader" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "loader = PDFMinerPDFasHTMLLoader(\"Data\\\\Policies\\\\2.16 Role Based Entitlements Policy V7 22.pdf\")\n", "data = loader.load()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "# NOTE(review): this re-parses the PDF already loaded in the previous cell; kept as-is\n", "# so the cell stays idempotent, but the two cells could be merged into one load.\n", "data = loader.load()[0] " ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "
\n", "\n", "\n", "\n", "\n", "