navid72m committed on
Commit
66f696e
1 Parent(s): c762fe4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -5
app.py CHANGED
@@ -50,14 +50,25 @@ def answer_question_from_pdf(pdf_text, question):
50
 
51
  # Function to extract text from PDF
52
  def extract_text_from_pdf(pdf_file):
53
- pdf_reader = PdfReader(pdf_file)
54
- pdf_text = ""
 
55
  pdf_arr = []
56
- for page_num in range(len(pdf_reader.pages)):
57
- pdf_text = pdf_reader.pages[page_num].extract_text()
 
 
 
 
 
 
 
 
 
 
58
  pdf_arr.append(pdf_text)
 
59
  return pdf_arr
60
-
61
  # Streamlit app
62
  st.title("PDF Explorer")
63
 
 
50
 
51
  # Function to extract text from PDF
52
  def extract_text_from_pdf(pdf_file):
53
+ # Open the PDF file
54
+ pdf_document = fitz.open(pdf_file)
55
+
56
  pdf_arr = []
57
+
58
+ # Iterate through each page
59
+ for page_num in range(len(pdf_document)):
60
+ # Get the page
61
+ page = pdf_document.load_page(page_num)
62
+
63
+ # Get the page as an image
64
+ pix = page.get_pixmap()
65
+ img = Image.open(io.BytesIO(pix.tobytes()))
66
+
67
+ # Perform OCR on the image
68
+ pdf_text = pytesseract.image_to_string(img)
69
  pdf_arr.append(pdf_text)
70
+
71
  return pdf_arr
 
72
  # Streamlit app
73
  st.title("PDF Explorer")
74