Kate0816 committed on
Commit
bb875a8
·
1 Parent(s): cfd99fa

Update modules/index_func.py

Browse files
Files changed (1) hide show
  1. modules/index_func.py +5 -2
modules/index_func.py CHANGED
@@ -36,9 +36,12 @@ def get_documents(file_src):
36
  except:
37
  pdftext = ""
38
  with open(filepath, "rb") as pdfFileObj:
39
- pdfReader = PyPDF2.PdfReader(pdfFileObj)
40
  for page in tqdm(pdfReader.pages):
41
- pdftext += page.extract_text()
 
 
 
42
  texts = [Document(page_content=pdftext,
43
  metadata={"source": filepath})]
44
  elif file_type == ".docx":
 
36
  except:
37
  pdftext = ""
38
  with open(filepath, "rb") as pdfFileObj:
39
+ pdfReader = PyPDF2.PdfReader(pdfFileObj) #pdfReader.pages 有多少頁
40
  for page in tqdm(pdfReader.pages):
41
+ pdftmp = page.extract_text() #每頁的文字加起來
42
+ logging.info(f"pdftmp:{pdftmp}")
43
+ pdftext += pdftmp
44
+
45
  texts = [Document(page_content=pdftext,
46
  metadata={"source": filepath})]
47
  elif file_type == ".docx":