HarshSanghavi
committed on
code setup for chatbot
Browse files- .gitattributes +3 -0
- Document.pdf +3 -0
- GPT OUTPUT.docx +3 -0
- GPT OUTPUT.pdf +3 -0
- app.py +104 -0
- app_config.py +19 -0
- functions.py +63 -0
- requirements.txt +11 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
Document.pdf filter=lfs diff=lfs merge=lfs -text
|
37 |
+
GPT[[:space:]]OUTPUT.docx filter=lfs diff=lfs merge=lfs -text
|
38 |
+
GPT[[:space:]]OUTPUT.pdf filter=lfs diff=lfs merge=lfs -text
|
Document.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6e3f9050436b8378c016a68fed3dc1496fedfc2e2eb0e993895d234e3aaabb3a
|
3 |
+
size 7575218
|
GPT OUTPUT.docx
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d1a876e10d48280e2e27551bcb4357c4dfd6339b5201c0343c074574372dd6e2
|
3 |
+
size 1386219
|
GPT OUTPUT.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:78f662edaf06fef24c4aa0953ee776d8fad70eee2a4b433209029d08bc75ff17
|
3 |
+
size 1351401
|
app.py
ADDED
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import random
|
3 |
+
from app_config import SYSTEM_PROMPT, NLP_MODEL_NAME, NUMBER_OF_VECTORS_FOR_RAG, NLP_MODEL_TEMPERATURE, NLP_MODEL_MAX_TOKENS, VECTOR_MAX_TOKENS
|
4 |
+
from functions import get_vectorstore_with_doc_from_pdf, tiktoken_len, get_vectorstore_with_doc_from_word
|
5 |
+
from langchain.memory import ConversationSummaryBufferMemory
|
6 |
+
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
|
7 |
+
from langchain.chains.summarize import load_summarize_chain
|
8 |
+
from langchain.prompts import PromptTemplate
|
9 |
+
from langchain_groq import ChatGroq
|
10 |
+
from dotenv import load_dotenv
|
11 |
+
from pathlib import Path
|
12 |
+
import os
|
13 |
+
from streamlit_pdf_viewer import pdf_viewer
|
14 |
+
env_path = Path('.') / '.env'
|
15 |
+
load_dotenv(dotenv_path=env_path)
|
16 |
+
|
17 |
+
def response_generator(prompt: str) -> str:
    """Answer a user query from retrieved document context and the chat history.

    The retriever, the LLM and the conversation memory are read from
    ``st.session_state`` (keys ``retriever``, ``llm`` and ``rag_memory``).

    Args:
        prompt (string): user query

    Returns:
        string: the model's answer, or a generic apology message if any step
        of retrieval or generation raises.
    """
    try:
        docs = st.session_state.retriever.invoke(prompt)
        my_context = '\n\n'.join(doc.page_content for doc in docs)

        memory = st.session_state.rag_memory
        # NOTE: str.format ignores keyword arguments that have no matching
        # placeholder, so the summary only reaches the model if SYSTEM_PROMPT
        # contains a {previous_message_summary} field.
        system_message = SystemMessage(content=SYSTEM_PROMPT.format(
            context=my_context,
            previous_message_summary=memory.moving_summary_buffer,
        ))

        # Plain message list: system prompt, buffered history, new user turn.
        chat_messages = [
            system_message,
            *memory.chat_memory.messages,
            HumanMessage(content=prompt),
        ]
        print("total tokens: ", tiktoken_len(str(chat_messages)))

        response = st.session_state.llm.invoke(chat_messages)
        return response.content

    except Exception as error:
        # UI boundary: report the failure on the console and show the user a
        # friendly message instead of a traceback.
        print(error)
        return "Oops! something went wrong, please try again."
|
43 |
+
|
44 |
+
|
45 |
+
st.markdown(
|
46 |
+
"""
|
47 |
+
<style>
|
48 |
+
.st-emotion-cache-janbn0 {
|
49 |
+
flex-direction: row-reverse;
|
50 |
+
text-align: right;
|
51 |
+
}
|
52 |
+
</style>
|
53 |
+
""",
|
54 |
+
unsafe_allow_html=True,
|
55 |
+
)
|
56 |
+
|
57 |
+
# When user gives input
|
58 |
+
with st.sidebar:
|
59 |
+
st.header("Hitachi Support Bot")
|
60 |
+
button = st.toggle("View Doc file.")
|
61 |
+
|
62 |
+
if button:
|
63 |
+
pdf_viewer("GPT OUTPUT.pdf")
|
64 |
+
else:
|
65 |
+
print("SYSTEM MESSAGE")
|
66 |
+
if "messages" not in st.session_state:
|
67 |
+
st.session_state.messages=[{"role": "system", "content": SYSTEM_PROMPT}]
|
68 |
+
|
69 |
+
print("SYSTEM MODEL")
|
70 |
+
if "llm" not in st.session_state:
    groq_api_key = os.getenv('GROQ_API_KEY')
    if not groq_api_key:
        # Wrapping a missing key in str() would pass the literal text "None"
        # as the API key, so stop here with a visible message instead.
        st.error("GROQ_API_KEY is not set. Add it to the environment or the .env file.")
        st.stop()
    st.session_state.llm = ChatGroq(
        temperature=NLP_MODEL_TEMPERATURE,
        groq_api_key=groq_api_key,
        model_name=NLP_MODEL_NAME,
    )
|
72 |
+
|
73 |
+
print("rag")
|
74 |
+
if "rag_memory" not in st.session_state:
|
75 |
+
st.session_state.rag_memory = ConversationSummaryBufferMemory(llm=st.session_state.llm, max_token_limit= 5000)
|
76 |
+
|
77 |
+
print("retrival")
|
78 |
+
if "retriever" not in st.session_state:
    # PDF alternative: get_vectorstore_with_doc_from_pdf('GPT OUTPUT.pdf')
    vector_store = get_vectorstore_with_doc_from_word('GPT OUTPUT.docx')
    # The number of results has to be passed through search_kwargs; a bare
    # k= keyword on as_retriever() is not used as a search parameter.
    st.session_state.retriever = vector_store.as_retriever(
        search_kwargs={"k": NUMBER_OF_VECTORS_FOR_RAG}
    )
|
82 |
+
|
83 |
+
print("container")
|
84 |
+
# Display chat messages from history
|
85 |
+
container = st.container(height=700)
|
86 |
+
for message in st.session_state.messages:
|
87 |
+
if message["role"] != "system":
|
88 |
+
with container.chat_message(message["role"]):
|
89 |
+
st.write(message["content"])
|
90 |
+
|
91 |
+
if prompt := st.chat_input("Enter your query here... "):
|
92 |
+
with container.chat_message("user"):
|
93 |
+
st.write(prompt)
|
94 |
+
st.session_state.messages.append({"role":"user" , "content":prompt})
|
95 |
+
|
96 |
+
with container.chat_message("assistant"):
|
97 |
+
response = response_generator(prompt=prompt)
|
98 |
+
print("******************************************************** Response ********************************************************")
|
99 |
+
print("MY RESPONSE IS:", response)
|
100 |
+
st.write(response)
|
101 |
+
|
102 |
+
print("Response is:", response)
|
103 |
+
st.session_state.rag_memory.save_context({'input': prompt}, {'output': response})
|
104 |
+
st.session_state.messages.append({"role":"assistant" , "content":response})
|
app_config.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
SYSTEM_PROMPT = """
|
3 |
+
1. You are Support bot for hitachi corporation. You must answer of any user questions using context only.
|
4 |
+
2. if you can't provide the answer of the quetions then only tell them "Thank you for your question! I'm here to help with information related to Hitachi corporation.the answer of this question is not given in this video. If you have any queries about those topics, feel free to ask. For other questions, I recommend reaching out to the appropriate source." nothing else.
|
5 |
+
3. User can also give you some greetings like thank you, welcome, please, sorry etc... so you have to handle it appropriately without giving any unnecessary information which is not wanted by user.
|
6 |
+
4. any information must be answered from provided context only, you must not to answer outside to the context.
|
7 |
+
|
8 |
+
context: {context}
|
9 |
+
"""
|
10 |
+
|
11 |
+
|
12 |
+
NLP_MODEL_NAME = "llama3-70b-8192"
|
13 |
+
REASONING_MODEL_NAME = "mixtral-8x7b-32768"
|
14 |
+
REASONING_MODEL_TEMPERATURE = 0
|
15 |
+
NLP_MODEL_TEMPERATURE = 0
|
16 |
+
NLP_MODEL_MAX_TOKENS = 5400
|
17 |
+
VECTOR_MAX_TOKENS = 6000
|
18 |
+
VECTORS_TOKEN_OVERLAP_SIZE = 20
|
19 |
+
NUMBER_OF_VECTORS_FOR_RAG = 1
|
functions.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import tiktoken
|
2 |
+
from langchain_text_splitters import CharacterTextSplitter
|
3 |
+
from langchain_chroma import Chroma
|
4 |
+
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
|
5 |
+
from langchain.document_loaders import PyMuPDFLoader,Docx2txtLoader
|
6 |
+
from transformers import pipeline
|
7 |
+
from app_config import VECTOR_MAX_TOKENS, VECTORS_TOKEN_OVERLAP_SIZE
|
8 |
+
from langchain.docstore.document import Document
|
9 |
+
from dotenv import load_dotenv
|
10 |
+
from pathlib import Path
|
11 |
+
import os
|
12 |
+
env_path = Path('.') / '.env'
|
13 |
+
load_dotenv(dotenv_path=env_path)
|
14 |
+
|
15 |
+
tokenizer = tiktoken.get_encoding('cl100k_base')
|
16 |
+
|
17 |
+
# create the length function
|
18 |
+
def tiktoken_len(text):
    """Return how many tokens the module-level ``tokenizer`` produces for ``text``.

    ``disallowed_special=()`` is passed to ``encode`` so that no special-token
    string is treated as disallowed.
    """
    return len(tokenizer.encode(text, disallowed_special=()))
|
24 |
+
|
25 |
+
|
26 |
+
def get_vectorstore_with_doc_from_pdf(pdf_path):
    """Build a Chroma vector store from the documents loaded out of a PDF.

    Args:
        pdf_path: path of the PDF file handed to ``PyMuPDFLoader``.

    Returns:
        A Chroma vector store with one text entry per loaded document,
        embedded with the ``BAAI/bge-small-en`` model on CPU using normalized
        embeddings.
    """
    embeddings = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-small-en",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )

    loaded_docs = PyMuPDFLoader(pdf_path).load()
    print(len(loaded_docs))

    # Only the text is indexed; the documents' metadata is not passed on.
    return Chroma.from_texts(
        texts=[loaded.page_content for loaded in loaded_docs],
        embedding=embeddings,
    )
|
42 |
+
|
43 |
+
def get_vectorstore_with_doc_from_word(word_path):
    """Build a Chroma vector store from a Word file split on "Page :" markers.

    Only the first document returned by ``Docx2txtLoader`` is split and
    indexed. The splitter is created with just a separator, so its chunk size
    and overlap are the library defaults.

    Args:
        word_path: path of the .docx file handed to ``Docx2txtLoader``.

    Returns:
        A Chroma vector store holding the resulting text chunks, embedded
        with the ``BAAI/bge-small-en`` model on CPU using normalized
        embeddings.
    """
    embeddings = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-small-en",
        model_kwargs={"device": "cpu"},
        encode_kwargs={"normalize_embeddings": True},
    )

    loaded_docs = Docx2txtLoader(word_path).load()
    splitter = CharacterTextSplitter(separator="Page :")

    print(len(loaded_docs))
    print("all splits ........................")
    chunks = splitter.split_text(loaded_docs[0].page_content)
    print(len(chunks))

    return Chroma.from_texts(texts=chunks, embedding=embeddings)
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
langchain
|
3 |
+
langchain_groq
|
4 |
+
python-dotenv
|
5 |
+
langchain_community
|
6 |
+
langchain_chroma
|
7 |
+
tiktoken
|
8 |
+
sentence_transformers
|
9 |
+
pymupdf
|
10 |
+
docx2txt
|
11 |
+
streamlit_pdf_viewer
|