YashDave committed on
Commit
997996d
·
verified ·
1 Parent(s): 3abd1c3

Update app_config.py

Browse files
Files changed (1) hide show
  1. app_config.py +20 -27
app_config.py CHANGED
@@ -1,29 +1,21 @@
1
  import tiktoken
2
  from langchain_text_splitters import RecursiveCharacterTextSplitter
3
  from langchain_chroma import Chroma
4
- from langchain_community.embeddings import HuggingFaceBgeEmbeddings
5
- from langchain.document_loaders import PyPDFLoader
6
- from langchain.memory import ConversationSummaryBufferMemory
7
  from langchain_groq import ChatGroq
8
  import os
9
  from dotenv import load_dotenv
10
 
11
-
12
  # Load environment variables from .env file
13
  load_dotenv()
14
  tokenizer = tiktoken.get_encoding('cl100k_base')
15
- FILE_NAMEs = os.listdir('data')
16
-
17
- # system_template = """ you are LIC Customer Service Chatbot.
18
- # Use the following pieces of context to answer the user's question.
19
- # If you don't know the answer, just say that you don't know, don't try to make up an answer.
20
- # ----------------
21
- # {context}"""
22
-
23
 
24
  SYSTEM_PROMPT = """
25
  You are an insurance policy expert bot. You have different policies which can be found in company list.
26
- Here is the list of companies providng this policies
27
  Your tasks when user asks question:
28
  1. Familiarize themselves with the policy terms and conditions.
29
  2. Clear any doubts they may have about the policy.
@@ -45,43 +37,44 @@ VECTOR_MAX_TOKENS = 100
45
  VECTORS_TOKEN_OVERLAP_SIZE = 20
46
  NUMBER_OF_VECTORS_FOR_RAG = 7
47
 
48
-
49
-
50
- # create the length function
51
  def tiktoken_len(text):
52
- tokens = tokenizer.encode(
53
- text,
54
- disallowed_special=()
55
- )
56
  return len(tokens)
 
57
  def get_vectorstore():
58
  model_name = "BAAI/bge-small-en"
59
  model_kwargs = {"device": "cpu"}
60
  encode_kwargs = {"normalize_embeddings": True}
61
- hf = HuggingFaceBgeEmbeddings(
62
  model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
63
  )
 
64
  all_splits = []
65
  for file_name in FILE_NAMEs:
66
  if file_name.endswith(".pdf"):
67
- loader = PyPDFLoader(os.path.join("data",file_name))
68
  data = loader.load()[0].page_content
69
  else:
70
- with open(os.path.join("data",file_name), "r") as f:
71
  data = f.read()
72
  text_splitter = RecursiveCharacterTextSplitter(
73
  chunk_size=VECTOR_MAX_TOKENS,
74
  chunk_overlap=VECTORS_TOKEN_OVERLAP_SIZE,
75
  length_function=tiktoken_len,
76
- separators=["\n\n\n","\n\n", "\n", " ", ""]
77
  )
78
  all_splits = all_splits + text_splitter.split_text(data)
79
 
80
- vectorstore = Chroma.from_texts(texts=all_splits ,embedding=hf)
 
 
 
 
 
 
81
  return vectorstore
82
 
83
-
84
  chat = ChatGroq(temperature=0, groq_api_key=os.getenv("GROQ_API_KEY"), model_name="llama3-8b-8192", streaming=True)
85
  rag_memory = ConversationSummaryBufferMemory(llm=chat, max_token_limit=3000)
86
-
87
  my_vector_store = get_vectorstore()
 
1
  import tiktoken
2
  from langchain_text_splitters import RecursiveCharacterTextSplitter
3
  from langchain_chroma import Chroma
4
+ from langchain_huggingface import HuggingFaceEmbeddings # Updated import
5
+ from langchain_community.document_loaders import PyPDFLoader # Updated import
6
+ from langchain.memory import ConversationSummaryBufferMemory # Remains the same for now
7
  from langchain_groq import ChatGroq
8
  import os
9
  from dotenv import load_dotenv
10
 
 
11
  # Load environment variables from .env file
12
  load_dotenv()
13
  tokenizer = tiktoken.get_encoding('cl100k_base')
14
+ FILE_NAMEs = os.listdir('data')
 
 
 
 
 
 
 
15
 
16
  SYSTEM_PROMPT = """
17
  You are an insurance policy expert bot. You have different policies which can be found in company list.
18
+ Here is the list of companies providing these policies
19
  Your tasks when user asks question:
20
  1. Familiarize themselves with the policy terms and conditions.
21
  2. Clear any doubts they may have about the policy.
 
37
  VECTORS_TOKEN_OVERLAP_SIZE = 20
38
  NUMBER_OF_VECTORS_FOR_RAG = 7
39
 
40
+ # Create the length function
 
 
41
def tiktoken_len(text):
    """Return the number of cl100k_base tokens in *text*.

    Used as the ``length_function`` for the text splitter so chunk sizes
    are measured in tokens rather than characters.
    """
    # disallowed_special=() makes special-token strings in the documents
    # be encoded as ordinary text instead of raising a ValueError.
    return len(tokenizer.encode(text, disallowed_special=()))
44
+
45
def get_vectorstore():
    """Build or reload the Chroma vector store over the files in ``data/``.

    Returns:
        A ``Chroma`` instance backed by BGE-small embeddings. If a
        persisted store already exists under ``./chroma_db`` it is
        reloaded as-is; otherwise the store is built from every file in
        ``data/`` and persisted there.
    """
    model_name = "BAAI/bge-small-en"
    model_kwargs = {"device": "cpu"}
    encode_kwargs = {"normalize_embeddings": True}
    hf = HuggingFaceEmbeddings(
        model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
    )

    persist_directory = "./chroma_db"  # Directory the vector store is persisted to

    # Fast path: reuse the persisted store. The original version loaded and
    # split every document unconditionally and then discarded that work on
    # this branch; checking first skips all document I/O and splitting.
    if os.path.exists(persist_directory):
        return Chroma(persist_directory=persist_directory, embedding_function=hf)

    all_splits = []
    for file_name in FILE_NAMEs:
        if file_name.endswith(".pdf"):
            loader = PyPDFLoader(os.path.join("data", file_name))
            # NOTE(review): only the first page of each PDF is indexed
            # ([0].page_content) — confirm this is intentional.
            data = loader.load()[0].page_content
        else:
            with open(os.path.join("data", file_name), "r") as f:
                data = f.read()
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=VECTOR_MAX_TOKENS,
            chunk_overlap=VECTORS_TOKEN_OVERLAP_SIZE,
            length_function=tiktoken_len,
            separators=["\n\n\n", "\n\n", "\n", " ", ""],
        )
        all_splits = all_splits + text_splitter.split_text(data)

    # First run: embed all chunks and persist the store for later reloads.
    return Chroma.from_texts(
        texts=all_splits, embedding=hf, persist_directory=persist_directory
    )
77
 
 
78
  chat = ChatGroq(temperature=0, groq_api_key=os.getenv("GROQ_API_KEY"), model_name="llama3-8b-8192", streaming=True)
79
  rag_memory = ConversationSummaryBufferMemory(llm=chat, max_token_limit=3000)
 
80
  my_vector_store = get_vectorstore()