qorgh346 committed on
Commit
dd9ce97
1 Parent(s): 7af777c

update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -8
app.py CHANGED
@@ -10,16 +10,26 @@ from langchain.memory import ConversationBufferMemory
10
  from langchain.chains import ConversationalRetrievalChain
11
  from htmlTemplates import css, bot_template, user_template
12
  from langchain.llms import HuggingFaceHub, LlamaCpp,CTransformers # For loading transformer models.
13
-
 
14
  def get_pdf_text(pdf_docs):
15
- text = ''
16
  # pdf_file_ = open(pdf_docs,'rb')
17
  # text = "example hofjin"
18
- pdf_reader = PdfReader(pdf_docs)
19
- for page in pdf_reader.pages:
20
- text += page.extract_text()
21
 
22
- return text
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
 
25
  def get_text_chunks(text):
@@ -151,7 +161,7 @@ def main():
151
  if st.button("Process"):
152
  with st.spinner("Processing"):
153
  # get pdf text
154
- raw_text = ""
155
 
156
  for file in docs:
157
  print('file - type : ', file.type)
@@ -160,7 +170,7 @@ def main():
160
  raw_text += get_text_file(file)
161
  elif file.type in ['application/octet-stream', 'application/pdf']:
162
  #file is .pdf
163
- raw_text += get_pdf_text(file)
164
  elif file.type == 'text/csv':
165
  #file is .csv
166
  raw_text += get_csv_file(file)
 
10
  from langchain.chains import ConversationalRetrievalChain
11
  from htmlTemplates import css, bot_template, user_template
12
  from langchain.llms import HuggingFaceHub, LlamaCpp,CTransformers # For loading transformer models.
13
+ from langchain.document_loaders import PyPDFLoader
14
+ from tempfile import NamedTemporaryFile
15
  def get_pdf_text(pdf_docs):
16
+ # text = ''
17
  # pdf_file_ = open(pdf_docs,'rb')
18
  # text = "example hofjin"
 
 
 
19
 
20
+
21
+ # for page in pdf_reader.pages:
22
+ # text += page.extract_text()
23
+
24
+ # return text
25
+ with NamedTemporaryFile() as temp_file:
26
+ temp_file.write(pdf_docs.getvalue())
27
+ temp_file.seek(0)
28
+ pdf_loader = PyPDFLoader(temp_file.name)
29
+ print('pdf_loader = ', pdf_loader)
30
+ pdf_doc = pdf_loader.load()
31
+ print('pdf_doc = ',pdf_doc)
32
+ return pdf_doc
33
 
34
 
35
  def get_text_chunks(text):
 
161
  if st.button("Process"):
162
  with st.spinner("Processing"):
163
  # get pdf text
164
+ doc_list = []
165
 
166
  for file in docs:
167
  print('file - type : ', file.type)
 
170
  raw_text += get_text_file(file)
171
  elif file.type in ['application/octet-stream', 'application/pdf']:
172
  #file is .pdf
173
+ doc_list.append(get_pdf_text(file))
174
  elif file.type == 'text/csv':
175
  #file is .csv
176
  raw_text += get_csv_file(file)