JBHF committed
Commit 60185cb
1 Parent(s): 6daff1a

Update app.py

Files changed (1):
app.py +41 -2
app.py CHANGED
@@ -55,8 +55,10 @@ st.write("---------------------------------")
 
 st.write("LIST OF ALL THE LOADED DOCUMENTS: ")
 st.write("")
-pdf_files = glob.glob("*.pdf")
-for file in pdf_files:
+# pdf_files = glob.glob("*.pdf")
+word_files = glob.glob("*.docx")
+# for file in pdf_files:
+for file in word_files:
     st.write(file)
 
 st.write("---------------------------------")
@@ -121,6 +123,43 @@ if "vector" not in st.session_state:
     loader = PyPDFDirectoryLoader(path, glob="**/*.pdf")
     docs = loader.load()
     st.session_state.docs = docs
+
+    # JB 18-03-2024:
+    # https://python.langchain.com/docs/integrations/document_loaders/
+    # MICROSOFT WORD:
+    # https://python.langchain.com/docs/integrations/document_loaders/microsoft_word
+    # 1 - Using Docx2txt
+    # Load .docx using Docx2txt into a document.
+    # %pip install --upgrade --quiet docx2txt
+    # from langchain_community.document_loaders import Docx2txtLoader
+    # loader = Docx2txtLoader("example_data/fake.docx")
+    # data = loader.load()
+    # data
+    # [Document(page_content='Lorem ipsum dolor sit amet.', metadata={'source': 'example_data/fake.docx'})]
+    #
+    # 2A - Using Unstructured
+    # from langchain_community.document_loaders import UnstructuredWordDocumentLoader
+    # loader = UnstructuredWordDocumentLoader("example_data/fake.docx")
+    # data = loader.load()
+    # data
+    # [Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'fake.docx'}, lookup_index=0)]
+    #
+    # 2B - Retain Elements
+    # Under the hood, Unstructured creates different “elements” for different chunks of text.
+    # By default we combine those together, but you can easily keep that separation by specifying mode="elements".
+    # loader = UnstructuredWordDocumentLoader("example_data/fake.docx", mode="elements")
+    # data = loader.load()
+    # data[0]
+    # Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'fake.docx', 'filename': 'fake.docx', 'category': 'Title'}, lookup_index=0)
+    #
+    # 2A - Using Unstructured
+    from langchain_community.document_loaders import UnstructuredWordDocumentLoader
+    loader = UnstructuredWordDocumentLoader(path, glob="**/*.docx")
+    docs = loader.load()
+    st.session_state.docs = docs
+
+
+
 
     st.session_state.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
     st.session_state.documents = st.session_state.text_splitter.split_documents(st.session_state.docs)
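
Note on the added loader call: per the LangChain docs linked in the comments, UnstructuredWordDocumentLoader takes the path of a single .docx file, while `path` here is a directory, so this call will likely fail at load time. A minimal sketch of the directory-level equivalent, wrapping the loader in DirectoryLoader; `path` is the same directory variable used in app.py, and `word_loader`/`word_docs` are illustrative names, not the committed code:

# Sketch: load every .docx under `path` by delegating the directory walk
# to DirectoryLoader and the per-file parsing to UnstructuredWordDocumentLoader.
from langchain_community.document_loaders import (
    DirectoryLoader,
    UnstructuredWordDocumentLoader,
)

word_loader = DirectoryLoader(
    path,                                    # same directory as the PDF loader above
    glob="**/*.docx",
    loader_cls=UnstructuredWordDocumentLoader,
)
word_docs = word_loader.load()               # list of Document objects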
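A second caveat: the new block reassigns `st.session_state.docs`, so the PDFs loaded a few lines earlier are discarded before splitting. A minimal sketch of keeping both sets, assuming the loaders above (`pdf_docs` is an illustrative name):

# Sketch: combine PDF and Word documents into one list, then split once.
pdf_docs = PyPDFDirectoryLoader(path, glob="**/*.pdf").load()
st.session_state.docs = pdf_docs + word_docs   # keep both, instead of overwriting

st.session_state.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
st.session_state.documents = st.session_state.text_splitter.split_documents(st.session_state.docs)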