Update app.py
app.py CHANGED
@@ -55,8 +55,10 @@ st.write("---------------------------------")
 
 st.write("LIST OF ALL THE LOADED DOCUMENTS: ")
 st.write("")
-pdf_files = glob.glob("*.pdf")
-for file in pdf_files:
+# pdf_files = glob.glob("*.pdf")
+word_files = glob.glob("*.docx")
+# for file in pdf_files:
+for file in word_files:
     st.write(file)
 
 st.write("---------------------------------")
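The hunk above swaps the file listing from PDFs to Word documents. A minimal sketch of how the listing could cover both extensions at once, assuming glob and streamlit are already imported at the top of app.py as the surrounding calls suggest:

import glob
import streamlit as st

# List both the original PDFs and the newly supported Word documents.
loaded_files = glob.glob("*.pdf") + glob.glob("*.docx")
for file in loaded_files:
    st.write(file)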
@@ -121,6 +123,43 @@ if "vector" not in st.session_state:
     loader = PyPDFDirectoryLoader(path, glob="**/*.pdf")
     docs = loader.load()
     st.session_state.docs = docs
+
+    # JB 18-03-2024:
+    # https://python.langchain.com/docs/integrations/document_loaders/
+    # MICROSOFT WORD:
+    # https://python.langchain.com/docs/integrations/document_loaders/microsoft_word
+    # 1 - Using Docx2txt
+    # Load .docx using Docx2txt into a document.
+    # %pip install --upgrade --quiet docx2txt
+    # from langchain_community.document_loaders import Docx2txtLoader
+    # loader = Docx2txtLoader("example_data/fake.docx")
+    # data = loader.load()
+    # data
+    # [Document(page_content='Lorem ipsum dolor sit amet.', metadata={'source': 'example_data/fake.docx'})]
+    #
+    # 2A - Using Unstructured
+    # from langchain_community.document_loaders import UnstructuredWordDocumentLoader
+    # loader = UnstructuredWordDocumentLoader("example_data/fake.docx")
+    # data = loader.load()
+    # data
+    # [Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'fake.docx'}, lookup_index=0)]
+    #
+    # 2B - Retain Elements
+    # Under the hood, Unstructured creates different “elements” for different chunks of text.
+    # By default we combine those together, but you can easily keep that separation by specifying mode="elements".
+    # loader = UnstructuredWordDocumentLoader("example_data/fake.docx", mode="elements")
+    # data = loader.load()
+    # data[0]
+    # Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'fake.docx', 'filename': 'fake.docx', 'category': 'Title'}, lookup_index=0)
+    #
+    # 2A - Using Unstructured
+    from langchain_community.document_loaders import UnstructuredWordDocumentLoader
+    loader = UnstructuredWordDocumentLoader(path, glob="**/*.docx")
+    docs = loader.load()
+    st.session_state.docs = docs
+
+
+
 
     st.session_state.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
     st.session_state.documents = st.session_state.text_splitter.split_documents(st.session_state.docs)
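Note that, unlike PyPDFDirectoryLoader, UnstructuredWordDocumentLoader loads a single file and does not accept a glob keyword, so the new call above will most likely not scan the folder as intended. A minimal sketch of a directory-wide equivalent, assuming the same path variable and Streamlit session state as the surrounding code:

import streamlit as st
from langchain_community.document_loaders import DirectoryLoader, UnstructuredWordDocumentLoader

# path is assumed to be the document folder already passed to PyPDFDirectoryLoader above.
# Walk it recursively and hand each .docx file to UnstructuredWordDocumentLoader.
loader = DirectoryLoader(path, glob="**/*.docx", loader_cls=UnstructuredWordDocumentLoader)
docs = loader.load()
st.session_state.docs = docs

This keeps the rest of the pipeline unchanged: the loaded documents still flow into RecursiveCharacterTextSplitter exactly as before.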