Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -50,14 +50,25 @@ def answer_question_from_pdf(pdf_text, question):
|
|
50 |
|
51 |
# Function to extract text from PDF
|
52 |
def extract_text_from_pdf(pdf_file):
|
53 |
-
|
54 |
-
|
|
|
55 |
pdf_arr = []
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
pdf_arr.append(pdf_text)
|
|
|
59 |
return pdf_arr
|
60 |
-
|
61 |
# Streamlit app
|
62 |
st.title("PDF Explorer")
|
63 |
|
|
|
50 |
|
51 |
# Function to extract text from PDF
|
52 |
def extract_text_from_pdf(pdf_file):
    """Return the OCR text of every page in a PDF, one string per page.

    Each page is rendered to a bitmap with PyMuPDF (``fitz``), loaded as a
    PIL image, and passed to ``pytesseract.image_to_string``.

    Args:
        pdf_file: Whatever ``fitz.open`` accepts as its first argument.
            NOTE(review): if the caller passes a Streamlit upload object
            rather than a path, this likely needs
            ``fitz.open(stream=..., filetype="pdf")`` -- confirm against
            the call site.

    Returns:
        A list of strings in page order; empty when the document has no
        pages.
    """
    pdf_arr = []

    # The context manager closes the document even if rendering or OCR fails.
    with fitz.open(pdf_file) as pdf_document:
        for page in pdf_document:
            # Render the page and hand it to PIL as in-memory PNG bytes.
            pix = page.get_pixmap()
            img = Image.open(io.BytesIO(pix.tobytes("png")))

            # Append inside the loop so every page contributes its text,
            # not just the last one processed.
            pdf_arr.append(pytesseract.image_to_string(img))

    return pdf_arr
|
|
|
72 |
# Streamlit app
|
73 |
st.title("PDF Explorer")
|
74 |
|