Update app_config.py

app_config.py CHANGED (+20 -27)
@@ -1,29 +1,21 @@
 import tiktoken
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_chroma import Chroma
-from
-from
-from langchain.memory import ConversationSummaryBufferMemory
+from langchain_huggingface import HuggingFaceEmbeddings  # Updated import
+from langchain_community.document_loaders import PyPDFLoader  # Updated import
+from langchain.memory import ConversationSummaryBufferMemory  # Remains the same for now
 from langchain_groq import ChatGroq
 import os
 from dotenv import load_dotenv

-
 # Load environment variables from .env file
 load_dotenv()
 tokenizer = tiktoken.get_encoding('cl100k_base')
-FILE_NAMEs
-
-# system_template = """ you are LIC Customer Service Chatbot.
-# Use the following pieces of context to answer the user's question.
-# If you don't know the answer, just say that you don't know, don't try to make up an answer.
-# ----------------
-# {context}"""
-
+FILE_NAMEs = os.listdir('data')

 SYSTEM_PROMPT = """
 You are an insurance policy expert bot. You have different policies which can be found in company list.
-Here is the list of companies
+Here is the list of companies providing these policies
 Your tasks when user asks question:
 1. Familiarize themselves with the policy terms and conditions.
 2. Clear any doubts they may have about the policy.
@@ -45,43 +37,44 @@ VECTOR_MAX_TOKENS = 100
 VECTORS_TOKEN_OVERLAP_SIZE = 20
 NUMBER_OF_VECTORS_FOR_RAG = 7

-
-
-# create the length function
+# Create the length function
 def tiktoken_len(text):
-    tokens = tokenizer.encode(
-        text,
-        disallowed_special=()
-    )
+    tokens = tokenizer.encode(text, disallowed_special=())
     return len(tokens)
+
 def get_vectorstore():
     model_name = "BAAI/bge-small-en"
     model_kwargs = {"device": "cpu"}
     encode_kwargs = {"normalize_embeddings": True}
-    hf =
+    hf = HuggingFaceEmbeddings(
         model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
     )
+    persist_directory = "./chroma_db"  # Directory to save the vector store
     all_splits = []
     for file_name in FILE_NAMEs:
         if file_name.endswith(".pdf"):
-            loader = PyPDFLoader(os.path.join("data",file_name))
+            loader = PyPDFLoader(os.path.join("data", file_name))
             data = loader.load()[0].page_content
         else:
-            with open(os.path.join("data",file_name), "r") as f:
+            with open(os.path.join("data", file_name), "r") as f:
                 data = f.read()
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=VECTOR_MAX_TOKENS,
             chunk_overlap=VECTORS_TOKEN_OVERLAP_SIZE,
             length_function=tiktoken_len,
-            separators=["\n\n\n","\n\n", "\n", " ", ""]
+            separators=["\n\n\n", "\n\n", "\n", " ", ""]
         )
         all_splits = all_splits + text_splitter.split_text(data)

-
+    # Check if the vector store already exists
+    if os.path.exists(persist_directory):
+        vectorstore = Chroma(persist_directory=persist_directory, embedding_function=hf)
+    else:
+        vectorstore = Chroma.from_texts(
+            texts=all_splits, embedding=hf, persist_directory=persist_directory
+        )
     return vectorstore

-
 chat = ChatGroq(temperature=0, groq_api_key=os.getenv("GROQ_API_KEY"), model_name="llama3-8b-8192", streaming=True)
 rag_memory = ConversationSummaryBufferMemory(llm=chat, max_token_limit=3000)
-
 my_vector_store = get_vectorstore()
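Two details of the splitter configuration above are easy to miss. Because the splitter is built with length_function=tiktoken_len, chunk_size=VECTOR_MAX_TOKENS and VECTORS_TOKEN_OVERLAP_SIZE are measured in cl100k_base tokens, not characters; and disallowed_special=() keeps tiktoken from raising on special-token strings such as "<|endoftext|>" that may appear in scraped policy text. A minimal, self-contained sketch of the same pattern (the sample text and the chunk sizes here are illustrative):

import tiktoken
from langchain_text_splitters import RecursiveCharacterTextSplitter

tokenizer = tiktoken.get_encoding("cl100k_base")

def tiktoken_len(text):
    # disallowed_special=() encodes special-token strings as plain text
    # instead of raising a ValueError.
    return len(tokenizer.encode(text, disallowed_special=()))

splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,  # a token budget, because of length_function below
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n\n", "\n\n", "\n", " ", ""],
)

chunks = splitter.split_text("A policy document paragraph. " * 200)
print(len(chunks), max(tiktoken_len(c) for c in chunks))  # chunks stay near the 100-token budget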
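The main behavioral change in this commit is inside get_vectorstore(): the Chroma index is now written to ./chroma_db on the first run and reloaded from disk on later runs, instead of being re-embedded at every startup. A stripped-down sketch of the same load-or-build pattern, using the packages imported in the diff (the texts list and the demo path are illustrative stand-ins for all_splits and ./chroma_db):

import os
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

texts = ["Policy A covers fire damage.", "Policy B covers theft."]
persist_directory = "./demo_chroma_db"  # hypothetical path for this sketch

hf = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en",
                           encode_kwargs={"normalize_embeddings": True})

if os.path.exists(persist_directory):
    # Later runs: open the saved index; nothing is re-embedded.
    vectorstore = Chroma(persist_directory=persist_directory, embedding_function=hf)
else:
    # First run: embed the texts and write the index to disk.
    vectorstore = Chroma.from_texts(texts=texts, embedding=hf,
                                    persist_directory=persist_directory)

print(vectorstore.similarity_search("What does Policy A cover?", k=1))

One trade-off worth noting: in the committed version the loading-and-splitting loop still runs (PDF parsing included) even when the persisted store is reused, and the index goes stale if the files in data/ change after the first run; deleting ./chroma_db forces a rebuild.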
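Finally, the module builds chat, rag_memory, and my_vector_store at import time, but none of the calling code appears in this commit. The following is therefore only a hedged sketch of how an app might wire the three together for retrieval-augmented answers; the answer() helper and its prompt assembly are hypothetical, not taken from the repository:

from langchain_core.messages import HumanMessage, SystemMessage

from app_config import (SYSTEM_PROMPT, NUMBER_OF_VECTORS_FOR_RAG,
                        chat, rag_memory, my_vector_store)

def answer(question):
    # Retrieve the k chunks configured in app_config (NUMBER_OF_VECTORS_FOR_RAG = 7).
    docs = my_vector_store.similarity_search(question, k=NUMBER_OF_VECTORS_FOR_RAG)
    context = "\n\n".join(doc.page_content for doc in docs)

    # Hypothetical prompt assembly: system prompt plus retrieved context.
    messages = [
        SystemMessage(content=f"{SYSTEM_PROMPT}\n\nContext:\n{context}"),
        HumanMessage(content=question),
    ]
    reply = chat.invoke(messages).content

    # Track the exchange in the summary-buffer memory.
    rag_memory.save_context({"input": question}, {"output": reply})
    return reply

print(answer("What is the grace period for premium payments?"))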