vinhnx90 committed on
Commit 0e17e2d • 1 Parent(s): 4ee1085

Add new app icon. Refactor and cleanup.

Files changed (6)
  1. README.md +1 -1
  2. app.py +52 -107
  3. assets/app_icon.png +0 -0
  4. assets/icon.jpg +0 -0
  5. assets/large_icon.png +0 -0
  6. document_retriever.py +58 -0
README.md CHANGED
@@ -1,5 +1,5 @@
 <div align="center">
-<img alt="app icon" height="196px" src="./assets/icon.jpg">
+<img alt="app icon" height="196px" src="./assets/app_icon.jpg">
 </div>
 
 <div align="center">
app.py CHANGED
@@ -1,27 +1,16 @@
-import os
-import tempfile
-
 import streamlit as st
+
 from langchain.chains import ConversationalRetrievalChain
 from langchain.chat_models import ChatOpenAI
-from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.memory import ConversationBufferMemory
 from langchain.memory.chat_message_histories import StreamlitChatMessageHistory
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.document_loaders import (
-    Docx2txtLoader,
-    PyPDFLoader,
-    TextLoader,
-    UnstructuredEPubLoader,
-)
-from langchain_community.vectorstores import DocArrayInMemorySearch
 
+from document_retriever import configure_retriever
 from calback_handler import PrintRetrievalHandler, StreamHandler
 from chat_profile import ChatProfileRoleEnum
 
 # configs
 LLM_MODEL_NAME = "gpt-3.5-turbo"
-EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
 
 st.set_page_config(
     page_title=":books: InkChatGPT: Chat with Documents",
@@ -34,115 +23,71 @@ st.set_page_config(
     },
 )
 
-st.image("./assets/icon.jpg", width=100)
-st.header(
-    ":gray[:books: InkChatGPT]",
-    divider="blue",
-)
-st.write("**Chat** with Documents")
-
 # Setup memory for contextual conversation
 msgs = StreamlitChatMessageHistory()
 
-@st.cache_resource(ttl="1h")
-def configure_retriever(files):
-    # Read documents
-    docs = []
-    temp_dir = tempfile.TemporaryDirectory()
-    for file in files:
-        temp_filepath = os.path.join(temp_dir.name, file.name)
-        with open(temp_filepath, "wb") as f:
-            f.write(file.getvalue())
-
-        _, extension = os.path.splitext(temp_filepath)
-
-        # Load the file using the appropriate loader
-        if extension == ".pdf":
-            loader = PyPDFLoader(temp_filepath)
-        elif extension == ".docx":
-            loader = Docx2txtLoader(temp_filepath)
-        elif extension == ".txt":
-            loader = TextLoader(temp_filepath)
-        elif extension == ".epub":
-            loader = UnstructuredEPubLoader(temp_filepath)
-        else:
-            st.write("This document format is not supported!")
-            return None
-
-        # loader = PyPDFLoader(temp_filepath)
-        docs.extend(loader.load())
-
-    # Split documents
-    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
-    splits = text_splitter.split_documents(docs)
-
-    # Create embeddings and store in vectordb
-    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
-    vectordb = DocArrayInMemorySearch.from_documents(splits, embeddings)
-
-    # Define retriever
-    retriever = vectordb.as_retriever(
-        search_type="mmr", search_kwargs={"k": 2, "fetch_k": 4}
-    )
-
-    return retriever
-
-
-with st.sidebar.expander("Documents"):
-    st.subheader("Files")
+with st.container():
+    col1, col2 = st.columns([0.2, 0.8])
+    with col1:
+        st.image(
+            "./assets/large_icon.png", use_column_width="always", output_format="PNG"
+        )
+    with col2:
+        st.header(":books: InkChatGPT")
+        st.write("**Chat** with Documents")
+        st.caption("Supports PDF, TXT, DOCX, EPUB • Limit 200MB per file")
+
+chat_tab, documents_tab, settings_tab = st.tabs(["Chat", "Documents", "Settings"])
+with settings_tab:
+    openai_api_key = st.text_input("OpenAI API Key", type="password")
+    if len(msgs.messages) == 0 or st.button("Clear message history"):
+        msgs.clear()
+        msgs.add_ai_message("How can I help you?")
 
+with documents_tab:
     uploaded_files = st.file_uploader(
         label="Select files",
        type=["pdf", "txt", "docx", "epub"],
         accept_multiple_files=True,
     )
 
-with st.sidebar.expander("Setup"):
-    st.subheader("API Key")
-    openai_api_key = st.text_input("OpenAI API Key", type="password")
+with chat_tab:
+    if uploaded_files:
+        result_retriever = configure_retriever(uploaded_files)
 
-is_empty_chat_messages = len(msgs.messages) == 0
-if is_empty_chat_messages or st.button("Clear message history"):
-    msgs.clear()
-    msgs.add_ai_message("How can I help you?")
+        memory = ConversationBufferMemory(
+            memory_key="chat_history", chat_memory=msgs, return_messages=True
+        )
 
-if not openai_api_key:
-    st.info("Please add your OpenAI API key in the sidebar to continue.")
-    st.stop()
+        # Setup LLM and QA chain
+        llm = ChatOpenAI(
+            model_name=LLM_MODEL_NAME,
+            openai_api_key=openai_api_key,
+            temperature=0,
+            streaming=True,
+        )
 
-if uploaded_files:
-    result_retriever = configure_retriever(uploaded_files)
+        chain = ConversationalRetrievalChain.from_llm(
+            llm, retriever=result_retriever, memory=memory, verbose=False
+        )
 
-    memory = ConversationBufferMemory(
-        memory_key="chat_history", chat_memory=msgs, return_messages=True
-    )
+        avatars = {
+            ChatProfileRoleEnum.Human: "user",
+            ChatProfileRoleEnum.AI: "assistant",
+        }
 
-    # Setup LLM and QA chain
-    llm = ChatOpenAI(
-        model_name=LLM_MODEL_NAME,
-        openai_api_key=openai_api_key,
-        temperature=0,
-        streaming=True,
-    )
-
-    chain = ConversationalRetrievalChain.from_llm(
-        llm, retriever=result_retriever, memory=memory, verbose=False
-    )
+        for msg in msgs.messages:
+            st.chat_message(avatars[msg.type]).write(msg.content)
 
-    avatars = {
-        ChatProfileRoleEnum.Human: "user",
-        ChatProfileRoleEnum.AI: "assistant",
-    }
-
-    for msg in msgs.messages:
-        st.chat_message(avatars[msg.type]).write(msg.content)
+    if not openai_api_key:
+        st.caption("🔑 Add your **OpenAI API key** on the `Settings` to continue.")
 
-if user_query := st.chat_input(placeholder="Ask me anything!"):
-    st.chat_message("user").write(user_query)
+    if user_query := st.chat_input(
+        placeholder="Ask me anything!", disabled=(not openai_api_key)
+    ):
+        st.chat_message("user").write(user_query)
 
-    with st.chat_message("assistant"):
-        retrieval_handler = PrintRetrievalHandler(st.empty())
-        stream_handler = StreamHandler(st.empty())
-        response = chain.run(
-            user_query, callbacks=[retrieval_handler, stream_handler]
-        )
+        with st.chat_message("assistant"):
+            retrieval_handler = PrintRetrievalHandler(st.empty())
+            stream_handler = StreamHandler(st.empty())
+            response = chain.run(user_query, callbacks=[retrieval_handler, stream_handler])
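In short, app.py now delegates document ingestion to `document_retriever.configure_retriever` and swaps the sidebar expanders for tabs. A minimal sketch of that tab pattern in isolation, assuming only `streamlit` (the variable names and labels here are illustrative, not the app's):

```python
import streamlit as st

# Each `with <tab>:` block scopes its widgets to that tab.
chat_tab, documents_tab, settings_tab = st.tabs(["Chat", "Documents", "Settings"])

with settings_tab:
    api_key = st.text_input("OpenAI API Key", type="password")

with documents_tab:
    files = st.file_uploader("Select files", accept_multiple_files=True)

with chat_tab:
    # As in the commit, the chat input stays disabled until a key is entered.
    st.chat_input("Ask me anything!", disabled=not api_key)
```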
 
 
assets/app_icon.png ADDED
assets/icon.jpg DELETED
Binary file (49.5 kB)
 
assets/large_icon.png ADDED
document_retriever.py ADDED
@@ -0,0 +1,58 @@
+import os
+import tempfile
+
+import streamlit as st
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain_community.document_loaders import (
+    Docx2txtLoader,
+    PyPDFLoader,
+    TextLoader,
+    UnstructuredEPubLoader,
+)
+from langchain_community.vectorstores import DocArrayInMemorySearch
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
+EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
+
+
+@st.cache_resource(ttl="1h")
+def configure_retriever(files):
+    # Read documents
+    docs = []
+    temp_dir = tempfile.TemporaryDirectory()
+    for file in files:
+        temp_filepath = os.path.join(temp_dir.name, file.name)
+        with open(temp_filepath, "wb") as f:
+            f.write(file.getvalue())
+
+        _, extension = os.path.splitext(temp_filepath)
+
+        # Load the file using the appropriate loader
+        if extension == ".pdf":
+            loader = PyPDFLoader(temp_filepath)
+        elif extension == ".docx":
+            loader = Docx2txtLoader(temp_filepath)
+        elif extension == ".txt":
+            loader = TextLoader(temp_filepath)
+        elif extension == ".epub":
+            loader = UnstructuredEPubLoader(temp_filepath)
+        else:
+            st.write("This document format is not supported!")
+            return None
+
+        docs.extend(loader.load())
+
+    # Split documents
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
+    splits = text_splitter.split_documents(docs)
+
+    # Create embeddings and store in vectordb
+    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
+    vectordb = DocArrayInMemorySearch.from_documents(splits, embeddings)
+
+    # Define retriever
+    retriever = vectordb.as_retriever(
+        search_type="mmr", search_kwargs={"k": 2, "fetch_k": 4}
+    )
+
+    return retriever
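One detail worth noting in `configure_retriever`: with `search_type="mmr"`, the retriever first fetches the `fetch_k` (4) chunks most similar to the query, then keeps the `k` (2) most mutually diverse, which trims near-duplicate context before it reaches the chain. A standalone sketch of those settings using the same libraries the commit imports (the toy corpus is illustrative):

```python
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import DocArrayInMemorySearch

# Toy corpus with deliberate near-duplicates so MMR has something to prune.
texts = [
    "InkChatGPT lets you chat with PDF documents.",
    "InkChatGPT lets you chat with PDF files.",
    "It also supports TXT, DOCX and EPUB uploads.",
    "Bananas are rich in potassium.",
]
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectordb = DocArrayInMemorySearch.from_texts(texts, embeddings)

# Same settings as configure_retriever: fetch 4 candidates, keep the 2 most diverse.
retriever = vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 2, "fetch_k": 4})
for doc in retriever.get_relevant_documents("Which formats can I chat with?"):
    print(doc.page_content)
```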