arjun-mavonic
committed on
Commit
•
bb83cd5
0
Parent(s):
Initial commit
Browse files- .gitignore +11 -0
- .vscode/settings.json +4 -0
- LICENSE +21 -0
- README.md +63 -0
- main.py +66 -0
- requirement.txt +3 -0
- utils.py +73 -0
.gitignore
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
uploads
|
2 |
+
converted_docx
|
3 |
+
extracted_images
|
4 |
+
|
5 |
+
# env
|
6 |
+
venv
|
7 |
+
|
8 |
+
|
9 |
+
# misc
|
10 |
+
__pycache__
|
11 |
+
.DS_Store
|
.vscode/settings.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"editor.defaultFormatter": "ms-python.black-formatter",
|
3 |
+
"editor.formatOnSave": true
|
4 |
+
}
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2024 Arjun Nayak
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# PDF to DOCX Converter (for non-readable, scanned PDFs)
|
2 |
+
|
3 |
+
This is a Python application that converts non-readable PDF files, such as scanned documents, into readable Word documents. It achieves this by first converting the PDF files into images and then extracting the text from the images to create the Word documents. The application provides a user-friendly interface where you can upload PDF files, select the files you want to convert, and initiate the conversion process. Once the conversion is completed, the converted Word documents can be downloaded from the application.
|
4 |
+
|
5 |
+
## Installation
|
6 |
+
|
7 |
+
1. Clone the repository:
|
8 |
+
|
9 |
+
```shell
|
10 |
+
git clone https://github.com/arjun-mavonic/scanned-pdf-text-extracter.git
|
11 |
+
```
|
12 |
+
|
13 |
+
2. Navigate to the project directory:
|
14 |
+
|
15 |
+
```shell
|
16 |
+
cd scanned-pdf-text-extracter
|
17 |
+
```
|
18 |
+
|
19 |
+
3. Create a virtual environment:
|
20 |
+
|
21 |
+
```shell
|
22 |
+
python -m venv venv
|
23 |
+
```
|
24 |
+
|
25 |
+
4. Activate the virtual environment:
|
26 |
+
|
27 |
+
- For Windows:
|
28 |
+
|
29 |
+
```shell
|
30 |
+
venv\Scripts\activate
|
31 |
+
```
|
32 |
+
|
33 |
+
- For macOS/Linux:
|
34 |
+
|
35 |
+
```shell
|
36 |
+
source venv/bin/activate
|
37 |
+
```
|
38 |
+
|
39 |
+
5. Install the dependencies:
|
40 |
+
|
41 |
+
```shell
|
42 |
+
pip install -r requirement.txt
|
43 |
+
```
|
44 |
+
|
45 |
+
## Usage
|
46 |
+
|
47 |
+
1. Run the application:
|
48 |
+
|
49 |
+
```shell
|
50 |
+
streamlit run main.py
|
51 |
+
```
|
52 |
+
|
53 |
+
2. Upload a PDF file using the file uploader component.
|
54 |
+
|
55 |
+
3. Select the PDF files you want to convert to Word documents by checking their checkboxes.
|
56 |
+
|
57 |
+
4. Click the "Convert" button to start the conversion process.
|
58 |
+
|
59 |
+
5. Once the conversion is completed, the converted Word documents will be available for download in the right column.
|
60 |
+
|
61 |
+
## License
|
62 |
+
|
63 |
+
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for more information.
|
main.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from pdf2image import convert_from_path
|
3 |
+
from docx import Document
|
4 |
+
import streamlit as st
|
5 |
+
from utils import create_or_empty_dir, convert_pdf_to_images, create_docx_with_text
|
6 |
+
|
7 |
+
# Streamlit entry point: upload scanned PDFs, OCR the selected ones into
# Word documents, and offer the resulting .docx files for download.

extracted_images_dir = 'extracted_images'
# Resolve every working directory relative to this script so the app behaves
# the same regardless of the directory Streamlit was launched from.
current_dir = os.path.dirname(os.path.abspath(__file__))
uploads_dir = os.path.join(current_dir, 'uploads')
os.makedirs(uploads_dir, exist_ok=True)
converted_docx_dir = os.path.join(current_dir, 'converted_docx')
os.makedirs(converted_docx_dir, exist_ok=True)
# Single absolute path for the page-image scratch directory; the helpers that
# write the images and the one that reads them must agree on the location.
image_folder = os.path.join(current_dir, extracted_images_dir)

# Create a file uploader component
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

# Check if a file was uploaded
if uploaded_file is not None:
    # The name comes from the client; basename() drops any directory
    # components so the file can only be written inside uploads_dir.
    safe_name = os.path.basename(uploaded_file.name)
    with open(os.path.join(uploads_dir, safe_name), "wb") as file:
        file.write(uploaded_file.getbuffer())
    st.success("File uploaded successfully!")
else:
    st.info("Please upload a PDF file.")

# Get a list of all PDF files in the uploads directory. The extension check is
# case-insensitive so an upload named "SCAN.PDF" is listed as well.
pdf_files = [file for file in os.listdir(uploads_dir) if file.lower().endswith(".pdf")]

# Create a column layout
col1, col2 = st.columns(2)

# Show checkboxes for each PDF file in col1
# NOTE(review): the original indentation was not recoverable; the Convert
# button is assumed to live in col1 next to the checkboxes -- confirm.
with col1:
    selected_files = []
    for file in pdf_files:
        if st.checkbox(file):
            selected_files.append(file)

    # Offer the conversion button only when at least one file is selected
    if selected_files and st.button("Convert"):
        for file in selected_files:
            pdf_path = os.path.join(uploads_dir, file)

            # Empty the scratch directory for every PDF: page images are named
            # page_<n>.jpg, so pages left over from a longer, previously
            # converted PDF would otherwise end up in this document too.
            print(f'Creating or emptying the {extracted_images_dir} directory')
            create_or_empty_dir(image_folder)

            print(f'Converting {file} to images in {extracted_images_dir}')
            convert_pdf_to_images(pdf_path, image_folder)

            # splitext() removes only the trailing extension, whereas
            # str.replace(".pdf", "") would also strip ".pdf" mid-name.
            docx_name = f'{os.path.splitext(file)[0]}.docx'
            output_docx = os.path.join(converted_docx_dir, docx_name)
            # The name is built first because reusing the same quote character
            # inside a nested f-string is a SyntaxError before Python 3.12.
            print(f'Creating {docx_name} with text extracted from images in {extracted_images_dir}')
            create_docx_with_text(image_folder, output_docx)

        st.success("Conversion completed successfully!")

# Show documents from the converted_docx folder in col2
with col2:
    docx_files = [file for file in os.listdir(converted_docx_dir) if file.endswith(".docx")]
    for file in docx_files:
        # Read inside a context manager so the handle is closed on every rerun
        with open(os.path.join(converted_docx_dir, file), "rb") as docx_handle:
            docx_bytes = docx_handle.read()
        st.download_button(
            f"Download {file}",
            docx_bytes,
            file_name=file,
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )
|
requirement.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
pdf2image==1.17.0
|
2 |
+
pytesseract==0.3.10
|
3 |
+
python-docx==1.1.2
|
utils.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from pdf2image import convert_from_path
|
3 |
+
from PIL import Image
|
4 |
+
import pytesseract
|
5 |
+
from docx import Document
|
6 |
+
|
7 |
+
|
8 |
+
def create_or_empty_dir(directory):
    """
    Create the directory if it is missing, otherwise remove everything in it.

    Files and symbolic links are unlinked; real sub-directories are removed
    recursively, so the call also succeeds when the directory contains nested
    folders (a plain ``os.remove`` raises on those).

    Args:
        directory (str): The directory path.
    """
    # Local import keeps this function self-contained within the module.
    import shutil

    if not os.path.exists(directory):
        # Create the directory if it doesn't exist
        os.makedirs(directory)
        return

    # Empty the directory if it already exists
    for entry in os.listdir(directory):
        entry_path = os.path.join(directory, entry)
        # A symlink to a directory must be unlinked, not walked: rmtree
        # refuses symlinks and the link target should be left untouched.
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        else:
            os.remove(entry_path)
|
23 |
+
|
24 |
+
|
25 |
+
def convert_pdf_to_images(input_pdf, output_dir):
    """
    Render every page of a PDF and store each one as a JPEG image.

    The images are written to ``output_dir`` as ``page_<n>.jpg``, where
    ``<n>`` is the zero-based position of the page in the sequence returned
    by ``convert_from_path``.

    Args:
        input_pdf (str): The path to the input PDF file.
        output_dir (str): The directory to save the converted images.
    """
    page_number = 0
    for rendered_page in convert_from_path(input_pdf):
        # File names carry the page position so the pages can be re-ordered later
        destination = os.path.join(output_dir, f"page_{page_number}.jpg")
        rendered_page.save(destination, "JPEG")
        page_number += 1
|
39 |
+
|
40 |
+
|
41 |
+
def extract_text_from_image(image_path):
    """
    Extract text from an image using OCR (Optical Character Recognition).

    The image is opened with Pillow and handed to
    ``pytesseract.image_to_string``.

    Args:
        image_path (str): The path to the input image file.

    Returns:
        str: The extracted text from the image.
    """
    # The context manager closes the underlying file as soon as OCR is done,
    # instead of leaving one open handle per processed page.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)
|
54 |
+
|
55 |
+
|
56 |
+
def _page_sort_key(filename):
    """Return a sort key that orders ``<prefix>_<n>.<ext>`` names by ``n``."""
    try:
        page_number = int(filename.split("_")[1].split(".")[0])
    except (IndexError, ValueError):
        # Names without a numeric page index sort after the numbered pages,
        # alphabetically, instead of aborting the whole conversion.
        return (1, 0, filename)
    return (0, page_number, filename)


def create_docx_with_text(image_folder, output_docx):
    """
    Create a Word document (.docx) with text extracted from images.

    Every ``.png`` / ``.jpg`` file in ``image_folder`` is run through
    ``extract_text_from_image`` and its text is appended to the document as
    one paragraph. Images named ``<prefix>_<n>.<ext>`` are processed in
    ascending order of ``n``; any other image names follow afterwards.

    Args:
        image_folder (str): The directory containing the input images.
        output_docx (str): The path to save the output Word document.
    """
    # Filter before sorting: stray non-image entries in the folder must not
    # reach the sort key at all.
    image_names = [
        name
        for name in os.listdir(image_folder)
        if name.endswith((".png", ".jpg"))
    ]

    document = Document()
    for filename in sorted(image_names, key=_page_sort_key):
        image_path = os.path.join(image_folder, filename)
        # The text is used as returned. Re-encoding it as UTF-8 and decoding
        # the bytes as Latin-1 would turn every non-ASCII character into
        # mojibake (for example "é" would become "Ã©").
        text = extract_text_from_image(image_path)
        document.add_paragraph(text)
    document.save(output_docx)
|