YashDave committed on
Commit
997996d
·
verified ·
1 Parent(s): 3abd1c3

Update app_config.py

Browse files
Files changed (1) hide show
  1. app_config.py +20 -27
app_config.py CHANGED
@@ -1,29 +1,21 @@
1
  import tiktoken
2
  from langchain_text_splitters import RecursiveCharacterTextSplitter
3
  from langchain_chroma import Chroma
4
- from langchain_community.embeddings import HuggingFaceBgeEmbeddings
5
- from langchain.document_loaders import PyPDFLoader
6
- from langchain.memory import ConversationSummaryBufferMemory
7
  from langchain_groq import ChatGroq
8
  import os
9
  from dotenv import load_dotenv
10
 
11
-
12
  # Load environment variables from .env file
13
  load_dotenv()
14
  tokenizer = tiktoken.get_encoding('cl100k_base')
15
- FILE_NAMEs = os.listdir('data')
16
-
17
- # system_template = """ you are LIC Customer Service Chatbot.
18
- # Use the following pieces of context to answer the user's question.
19
- # If you don't know the answer, just say that you don't know, don't try to make up an answer.
20
- # ----------------
21
- # {context}"""
22
-
23
 
24
  SYSTEM_PROMPT = """
25
  You are an insurance policy expert bot. You have different policies which can be found in company list.
26
- Here is the list of companies providng this policies
27
  Your tasks when user asks question:
28
  1. Familiarize themselves with the policy terms and conditions.
29
  2. Clear any doubts they may have about the policy.
@@ -45,43 +37,44 @@ VECTOR_MAX_TOKENS = 100
45
  VECTORS_TOKEN_OVERLAP_SIZE = 20
46
  NUMBER_OF_VECTORS_FOR_RAG = 7
47
 
48
-
49
-
50
- # create the length function
51
  def tiktoken_len(text):
52
- tokens = tokenizer.encode(
53
- text,
54
- disallowed_special=()
55
- )
56
  return len(tokens)
 
57
  def get_vectorstore():
58
  model_name = "BAAI/bge-small-en"
59
  model_kwargs = {"device": "cpu"}
60
  encode_kwargs = {"normalize_embeddings": True}
61
- hf = HuggingFaceBgeEmbeddings(
62
  model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
63
  )
 
64
  all_splits = []
65
  for file_name in FILE_NAMEs:
66
  if file_name.endswith(".pdf"):
67
- loader = PyPDFLoader(os.path.join("data",file_name))
68
  data = loader.load()[0].page_content
69
  else:
70
- with open(os.path.join("data",file_name), "r") as f:
71
  data = f.read()
72
  text_splitter = RecursiveCharacterTextSplitter(
73
  chunk_size=VECTOR_MAX_TOKENS,
74
  chunk_overlap=VECTORS_TOKEN_OVERLAP_SIZE,
75
  length_function=tiktoken_len,
76
- separators=["\n\n\n","\n\n", "\n", " ", ""]
77
  )
78
  all_splits = all_splits + text_splitter.split_text(data)
79
 
80
- vectorstore = Chroma.from_texts(texts=all_splits ,embedding=hf)
 
 
 
 
 
 
81
  return vectorstore
82
 
83
-
84
  chat = ChatGroq(temperature=0, groq_api_key=os.getenv("GROQ_API_KEY"), model_name="llama3-8b-8192", streaming=True)
85
  rag_memory = ConversationSummaryBufferMemory(llm=chat, max_token_limit=3000)
86
-
87
  my_vector_store = get_vectorstore()
 
1
  import tiktoken
2
  from langchain_text_splitters import RecursiveCharacterTextSplitter
3
  from langchain_chroma import Chroma
4
+ from langchain_huggingface import HuggingFaceEmbeddings # Updated import
5
+ from langchain_community.document_loaders import PyPDFLoader # Updated import
6
+ from langchain.memory import ConversationSummaryBufferMemory # Remains the same for now
7
  from langchain_groq import ChatGroq
8
  import os
9
  from dotenv import load_dotenv
10
 
 
11
  # Load environment variables from .env file
12
  load_dotenv()
13
  tokenizer = tiktoken.get_encoding('cl100k_base')
14
+ FILE_NAMEs = os.listdir('data')
 
 
 
 
 
 
 
15
 
16
  SYSTEM_PROMPT = """
17
  You are an insurance policy expert bot. You have different policies which can be found in company list.
18
+ Here is the list of companies providing these policies
19
  Your tasks when user asks question:
20
  1. Familiarize themselves with the policy terms and conditions.
21
  2. Clear any doubts they may have about the policy.
 
37
  VECTORS_TOKEN_OVERLAP_SIZE = 20
38
  NUMBER_OF_VECTORS_FOR_RAG = 7
39
 
40
+ # Create the length function
 
 
41
def tiktoken_len(text):
    """Return the number of cl100k_base tokens in *text*.

    Used as the ``length_function`` for the text splitter so chunk sizes
    are measured in tokens rather than characters.
    """
    # disallowed_special=() makes special-token strings in the documents
    # be encoded as ordinary text instead of raising a ValueError.
    return len(tokenizer.encode(text, disallowed_special=()))
44
+
45
def get_vectorstore():
    """Build or reload the Chroma vector store over the files in ``data/``.

    Returns:
        A ``Chroma`` instance backed by BGE-small embeddings. If a
        persisted store already exists under ``./chroma_db`` it is
        reloaded as-is; otherwise the store is built from every file in
        ``data/`` and persisted there.
    """
    model_name = "BAAI/bge-small-en"
    model_kwargs = {"device": "cpu"}
    encode_kwargs = {"normalize_embeddings": True}
    hf = HuggingFaceEmbeddings(
        model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
    )

    persist_directory = "./chroma_db"  # Directory the vector store is persisted to

    # Fast path: reuse the persisted store. The original version loaded and
    # split every document unconditionally and then discarded that work on
    # this branch; checking first skips all document I/O and splitting.
    if os.path.exists(persist_directory):
        return Chroma(persist_directory=persist_directory, embedding_function=hf)

    all_splits = []
    for file_name in FILE_NAMEs:
        if file_name.endswith(".pdf"):
            loader = PyPDFLoader(os.path.join("data", file_name))
            # NOTE(review): only the first page of each PDF is indexed
            # ([0].page_content) — confirm this is intentional.
            data = loader.load()[0].page_content
        else:
            with open(os.path.join("data", file_name), "r") as f:
                data = f.read()
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=VECTOR_MAX_TOKENS,
            chunk_overlap=VECTORS_TOKEN_OVERLAP_SIZE,
            length_function=tiktoken_len,
            separators=["\n\n\n", "\n\n", "\n", " ", ""],
        )
        all_splits = all_splits + text_splitter.split_text(data)

    # First run: embed all chunks and persist the store for later reloads.
    return Chroma.from_texts(
        texts=all_splits, embedding=hf, persist_directory=persist_directory
    )
77
 
 
78
  chat = ChatGroq(temperature=0, groq_api_key=os.getenv("GROQ_API_KEY"), model_name="llama3-8b-8192", streaming=True)
79
  rag_memory = ConversationSummaryBufferMemory(llm=chat, max_token_limit=3000)
 
80
  my_vector_store = get_vectorstore()