vinhnx90 committed on
Commit
1ce831d
1 Parent(s): 68eaa27

Handle epub loader

Browse files
Files changed (2) hide show
  1. app.py +16 -9
  2. requirements.txt +2 -1
app.py CHANGED
@@ -2,18 +2,22 @@ import os
2
  import tempfile
3
 
4
  import streamlit as st
5
- from langchain.callbacks.base import BaseCallbackHandler
6
  from langchain.chains import ConversationalRetrievalChain
7
  from langchain.chat_models import ChatOpenAI
8
  from langchain.embeddings import HuggingFaceEmbeddings
9
  from langchain.memory import ConversationBufferMemory
10
  from langchain.memory.chat_message_histories import StreamlitChatMessageHistory
11
  from langchain.text_splitter import RecursiveCharacterTextSplitter
12
- from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader, TextLoader
 
 
 
 
 
13
  from langchain_community.vectorstores import DocArrayInMemorySearch
14
 
 
15
  from chat_profile import ChatProfileRoleEnum
16
- from calback_handler import StreamHandler, PrintRetrievalHandler
17
 
18
  # configs
19
  LLM_MODEL_NAME = "gpt-3.5-turbo"
@@ -42,11 +46,11 @@ msgs = StreamlitChatMessageHistory()
42
 
43
 
44
  @st.cache_resource(ttl="1h")
45
- def configure_retriever(uploaded_files):
46
  # Read documents
47
  docs = []
48
  temp_dir = tempfile.TemporaryDirectory()
49
- for file in uploaded_files:
50
  temp_filepath = os.path.join(temp_dir.name, file.name)
51
  with open(temp_filepath, "wb") as f:
52
  f.write(file.getvalue())
@@ -60,6 +64,8 @@ def configure_retriever(uploaded_files):
60
  loader = Docx2txtLoader(temp_filepath)
61
  elif extension == ".txt":
62
  loader = TextLoader(temp_filepath)
 
 
63
  else:
64
  st.write("This document format is not supported!")
65
  return None
@@ -86,10 +92,11 @@ def configure_retriever(uploaded_files):
86
  with st.sidebar.expander("Documents"):
87
  st.subheader("Files")
88
  uploaded_files = st.file_uploader(
89
- label="Select files", type=["pdf", "txt", "docx"], accept_multiple_files=True
 
 
90
  )
91
 
92
-
93
  with st.sidebar.expander("Setup"):
94
  st.subheader("API Key")
95
  openai_api_key = st.text_input("OpenAI API Key", type="password")
@@ -104,7 +111,7 @@ if not openai_api_key:
104
  st.stop()
105
 
106
  if uploaded_files:
107
- retriever = configure_retriever(uploaded_files)
108
 
109
  memory = ConversationBufferMemory(
110
  memory_key="chat_history", chat_memory=msgs, return_messages=True
@@ -119,7 +126,7 @@ if uploaded_files:
119
  )
120
 
121
  chain = ConversationalRetrievalChain.from_llm(
122
- llm, retriever=retriever, memory=memory, verbose=False
123
  )
124
 
125
  avatars = {
 
2
  import tempfile
3
 
4
  import streamlit as st
 
5
  from langchain.chains import ConversationalRetrievalChain
6
  from langchain.chat_models import ChatOpenAI
7
  from langchain.embeddings import HuggingFaceEmbeddings
8
  from langchain.memory import ConversationBufferMemory
9
  from langchain.memory.chat_message_histories import StreamlitChatMessageHistory
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
11
+ from langchain_community.document_loaders import (
12
+ Docx2txtLoader,
13
+ PyPDFLoader,
14
+ TextLoader,
15
+ UnstructuredEPubLoader,
16
+ )
17
  from langchain_community.vectorstores import DocArrayInMemorySearch
18
 
19
+ from calback_handler import PrintRetrievalHandler, StreamHandler
20
  from chat_profile import ChatProfileRoleEnum
 
21
 
22
  # configs
23
  LLM_MODEL_NAME = "gpt-3.5-turbo"
 
46
 
47
 
48
  @st.cache_resource(ttl="1h")
49
+ def configure_retriever(files):
50
  # Read documents
51
  docs = []
52
  temp_dir = tempfile.TemporaryDirectory()
53
+ for file in files:
54
  temp_filepath = os.path.join(temp_dir.name, file.name)
55
  with open(temp_filepath, "wb") as f:
56
  f.write(file.getvalue())
 
64
  loader = Docx2txtLoader(temp_filepath)
65
  elif extension == ".txt":
66
  loader = TextLoader(temp_filepath)
67
+ elif extension == ".epub":
68
+ loader = UnstructuredEPubLoader(temp_filepath)
69
  else:
70
  st.write("This document format is not supported!")
71
  return None
 
92
  with st.sidebar.expander("Documents"):
93
  st.subheader("Files")
94
  uploaded_files = st.file_uploader(
95
+ label="Select files",
96
+ type=["pdf", "txt", "docx", "epub"],
97
+ accept_multiple_files=True,
98
  )
99
 
 
100
  with st.sidebar.expander("Setup"):
101
  st.subheader("API Key")
102
  openai_api_key = st.text_input("OpenAI API Key", type="password")
 
111
  st.stop()
112
 
113
  if uploaded_files:
114
+ result_retriever = configure_retriever(uploaded_files)
115
 
116
  memory = ConversationBufferMemory(
117
  memory_key="chat_history", chat_memory=msgs, return_messages=True
 
126
  )
127
 
128
  chain = ConversationalRetrievalChain.from_llm(
129
+ llm, retriever=result_retriever, memory=memory, verbose=False
130
  )
131
 
132
  avatars = {
requirements.txt CHANGED
@@ -6,4 +6,5 @@ streamlit
6
  streamlit_chat
7
  streamlit-extras
8
  pypdf
9
- docx2txt
 
 
6
  streamlit_chat
7
  streamlit-extras
8
  pypdf
9
+ docx2txt
10
+ unstructured