vinhnx90 committed on
Commit
9caad80
1 Parent(s): cee7091

Improve performance with contextual compression, a technique in which retrieved documents are compressed and irrelevant information is filtered out.

Browse files
Files changed (3) hide show
  1. app.py +2 -2
  2. document_retriever.py +15 -11
  3. requirements.txt +2 -1
app.py CHANGED
@@ -1,12 +1,11 @@
1
  import streamlit as st
2
- from langchain.chains import ConversationalRetrievalChain
3
  from langchain.memory import ConversationBufferMemory
4
  from langchain.memory.chat_message_histories import StreamlitChatMessageHistory
5
  from langchain_community.chat_models import ChatOpenAI
6
-
7
  from calback_handler import PrintRetrievalHandler, StreamHandler
8
  from chat_profile import ChatProfileRoleEnum
9
  from document_retriever import configure_retriever
 
10
 
11
  st.set_page_config(
12
  page_title="InkChatGPT: Chat with Documents",
@@ -79,6 +78,7 @@ with chat_tab:
79
  retriever=result_retriever,
80
  memory=memory,
81
  verbose=False,
 
82
  )
83
 
84
  avatars = {
 
1
  import streamlit as st
 
2
  from langchain.memory import ConversationBufferMemory
3
  from langchain.memory.chat_message_histories import StreamlitChatMessageHistory
4
  from langchain_community.chat_models import ChatOpenAI
 
5
  from calback_handler import PrintRetrievalHandler, StreamHandler
6
  from chat_profile import ChatProfileRoleEnum
7
  from document_retriever import configure_retriever
8
+ from langchain.chains import ConversationalRetrievalChain
9
 
10
  st.set_page_config(
11
  page_title="InkChatGPT: Chat with Documents",
 
78
  retriever=result_retriever,
79
  memory=memory,
80
  verbose=False,
81
+ max_tokens_limit=4000,
82
  )
83
 
84
  avatars = {
document_retriever.py CHANGED
@@ -2,19 +2,16 @@ import os
2
  import tempfile
3
 
4
  import streamlit as st
5
- from langchain_community.document_loaders import (
6
- Docx2txtLoader,
7
- PyPDFLoader,
8
- TextLoader,
9
- UnstructuredEPubLoader,
10
- )
11
  from langchain_community.embeddings import HuggingFaceEmbeddings
12
  from langchain_community.vectorstores import DocArrayInMemorySearch
13
  from langchain_text_splitters import RecursiveCharacterTextSplitter
14
 
15
 
16
  @st.cache_resource(ttl="1h")
17
- def configure_retriever(files):
18
  # Read documents
19
  docs = []
20
  temp_dir = tempfile.TemporaryDirectory()
@@ -32,8 +29,6 @@ def configure_retriever(files):
32
  loader = Docx2txtLoader(temp_filepath)
33
  elif extension == ".txt":
34
  loader = TextLoader(temp_filepath)
35
- elif extension == ".epub":
36
- loader = UnstructuredEPubLoader(temp_filepath)
37
  else:
38
  st.write("This document format is not supported!")
39
  return None
@@ -45,7 +40,7 @@ def configure_retriever(files):
45
  splits = text_splitter.split_documents(docs)
46
 
47
  # Create embeddings and store in vectordb
48
- embeddings = HuggingFaceEmbeddings(model_name="all-mpnet-base-v2")
49
  vectordb = DocArrayInMemorySearch.from_documents(splits, embeddings)
50
 
51
  # Define retriever
@@ -53,4 +48,13 @@ def configure_retriever(files):
53
  search_type="mmr", search_kwargs={"k": 2, "fetch_k": 4}
54
  )
55
 
56
- return retriever
 
 
 
 
 
 
 
 
 
 
2
  import tempfile
3
 
4
  import streamlit as st
5
+ from langchain.retrievers import ContextualCompressionRetriever
6
+ from langchain.retrievers.document_compressors import EmbeddingsFilter
7
+ from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader, TextLoader
 
 
 
8
  from langchain_community.embeddings import HuggingFaceEmbeddings
9
  from langchain_community.vectorstores import DocArrayInMemorySearch
10
  from langchain_text_splitters import RecursiveCharacterTextSplitter
11
 
12
 
13
  @st.cache_resource(ttl="1h")
14
+ def configure_retriever(files, use_compression=False):
15
  # Read documents
16
  docs = []
17
  temp_dir = tempfile.TemporaryDirectory()
 
29
  loader = Docx2txtLoader(temp_filepath)
30
  elif extension == ".txt":
31
  loader = TextLoader(temp_filepath)
 
 
32
  else:
33
  st.write("This document format is not supported!")
34
  return None
 
40
  splits = text_splitter.split_documents(docs)
41
 
42
  # Create embeddings and store in vectordb
43
+ embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
44
  vectordb = DocArrayInMemorySearch.from_documents(splits, embeddings)
45
 
46
  # Define retriever
 
48
  search_type="mmr", search_kwargs={"k": 2, "fetch_k": 4}
49
  )
50
 
51
+ if not use_compression:
52
+ return retriever
53
+
54
+ embeddings_filter = EmbeddingsFilter(
55
+ embeddings=embeddings, similarity_threshold=0.76
56
+ )
57
+
58
+ return ContextualCompressionRetriever(
59
+ base_compressor=embeddings_filter, base_retriever=retriever
60
+ )
requirements.txt CHANGED
@@ -7,4 +7,5 @@ streamlit_chat
7
  streamlit-extras
8
  pypdf
9
  docx2txt
10
- unstructured
 
 
7
  streamlit-extras
8
  pypdf
9
  docx2txt
10
+ unstructured
11
+ tiktoken