arjun-mavonic committed on
Commit
bb83cd5
0 Parent(s):

Initial commit

Browse files
Files changed (7) hide show
  1. .gitignore +11 -0
  2. .vscode/settings.json +4 -0
  3. LICENSE +21 -0
  4. README.md +63 -0
  5. main.py +66 -0
  6. requirement.txt +3 -0
  7. utils.py +73 -0
.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ uploads
2
+ converted_docx
3
+ extracted_images
4
+
5
+ # env
6
+ venv
7
+
8
+
9
+ # misc
10
+ __pycache__
11
+ .DS_Store
.vscode/settings.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "editor.defaultFormatter": "ms-python.black-formatter",
3
+ "editor.formatOnSave": true
4
+ }
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Arjun Nayak
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PDF to DOCX Converter (non-readable PDFs, scanned-document PDFs)
2
+
3
+ This is a Python application that converts non-readable PDF files, such as scanned documents, into readable Word documents. It achieves this by first converting the PDF files into images and then extracting the text from the images to create the Word documents. The application provides a user-friendly interface where you can upload PDF files, select the files you want to convert, and initiate the conversion process. Once the conversion is completed, the converted Word documents can be downloaded from the application.
4
+
5
+ ## Installation
6
+
7
+ 1. Clone the repository:
8
+
9
+ ```shell
10
+ git clone https://github.com/arjun-mavonic/scanned-pdf-text-extracter.git
11
+ ```
12
+
13
+ 2. Navigate to the project directory:
14
+
15
+ ```shell
16
+ cd scanned-pdf-text-extracter
17
+ ```
18
+
19
+ 3. Create a virtual environment:
20
+
21
+ ```shell
22
+ python -m venv venv
23
+ ```
24
+
25
+ 4. Activate the virtual environment:
26
+
27
+ - For Windows:
28
+
29
+ ```shell
30
+ venv\Scripts\activate
31
+ ```
32
+
33
+ - For macOS/Linux:
34
+
35
+ ```shell
36
+ source venv/bin/activate
37
+ ```
38
+
39
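+ 5. Install the dependencies (pdf2image and pytesseract additionally require the Poppler and Tesseract OCR system packages to be installed):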
40
+
41
+ ```shell
42
+ pip install -r requirement.txt streamlit
43
+ ```
44
+
45
+ ## Usage
46
+
47
+ 1. Run the application:
48
+
49
+ ```shell
50
+ streamlit run main.py
51
+ ```
52
+
53
+ 2. Upload a PDF file using the file uploader component.
54
+
55
+ 3. Select the PDF files you want to convert by checking their checkboxes.
56
+
57
+ 4. Click the "Convert" button to start the conversion process.
58
+
59
+ 5. Once the conversion is completed, the converted Word documents will be available for download in the right column.
60
+
61
+ ## License
62
+
63
+ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for more information.
main.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pdf2image import convert_from_path
3
+ from docx import Document
4
+ import streamlit as st
5
+ from utils import create_or_empty_dir, convert_pdf_to_images, create_docx_with_text
6
+
7
# Streamlit page: upload scanned PDFs, OCR the selected ones into .docx files,
# and offer the results for download.

extracted_images_dir = 'extracted_images'
# Get the current directory
current_dir = os.path.dirname(os.path.abspath(__file__))
uploads_dir = os.path.join(current_dir, 'uploads')
os.makedirs(uploads_dir, exist_ok=True)
converted_docx_dir = os.path.join(current_dir, 'converted_docx')
os.makedirs(converted_docx_dir, exist_ok=True)
# One absolute path for the scratch image folder, used for clearing, rendering
# and OCR alike, so all three steps agree even when the app is launched from a
# different working directory.
image_folder = os.path.join(current_dir, extracted_images_dir)

# Create a file uploader component
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")

# Check if a file was uploaded
if uploaded_file is not None:
    # Keep only the final path component of the client-supplied name so the
    # file cannot be written outside the uploads directory.
    upload_name = os.path.basename(uploaded_file.name)
    # Save the uploaded file to the uploads directory
    with open(os.path.join(uploads_dir, upload_name), "wb") as file:
        file.write(uploaded_file.getbuffer())
    st.success("File uploaded successfully!")
else:
    st.info("Please upload a PDF file.")

# Get a list of all PDF files in the uploads directory (extension matched
# case-insensitively, sorted so the checkbox order is stable between reruns)
pdf_files = sorted(
    file for file in os.listdir(uploads_dir) if file.lower().endswith(".pdf")
)

# Create a column layout
col1, col2 = st.columns(2)

# Show checkboxes for each PDF file in col1
with col1:
    selected_files = [file for file in pdf_files if st.checkbox(file)]

# NOTE(review): the indentation of the original script could not be recovered;
# the Convert button is rendered at page level here. Move this section under
# `with col1:` if it was meant to sit inside the left column.
# Check if any files are selected
if selected_files:
    # Create a button to trigger the conversion process
    if st.button("Convert"):
        for file in selected_files:
            pdf_path = os.path.join(uploads_dir, file)
            # splitext() strips only the trailing extension, unlike
            # str.replace(), which would also remove ".pdf" inside the name.
            docx_name = f"{os.path.splitext(file)[0]}.docx"
            output_docx = os.path.join(converted_docx_dir, docx_name)

            # Empty the scratch folder before EVERY file; otherwise pages left
            # over from a longer, previously converted PDF would be OCR'd into
            # this document as well.
            print(f'Creating or emptying the {extracted_images_dir} directory')
            create_or_empty_dir(image_folder)

            # Convert the PDF to images
            print(f'Converting {file} to images in {extracted_images_dir}')
            convert_pdf_to_images(pdf_path, image_folder)

            # Create a Word document with text extracted from images
            print(f'Creating {docx_name} with text extracted from images in {extracted_images_dir}')
            create_docx_with_text(image_folder, output_docx)

        st.success("Conversion completed successfully!")

# Show documents from the converted_docx folder in col2
with col2:
    docx_files = sorted(
        file for file in os.listdir(converted_docx_dir) if file.endswith(".docx")
    )
    for file in docx_files:
        # Read through a context manager so the file handle is closed on every
        # script rerun instead of being left to the garbage collector.
        with open(os.path.join(converted_docx_dir, file), "rb") as docx_handle:
            docx_bytes = docx_handle.read()
        st.download_button(
            f"Download {file}",
            docx_bytes,
            file_name=file,
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )
requirement.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ pdf2image==1.17.0
2
+ pytesseract==0.3.10
3
+ python-docx==1.1.2
utils.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pdf2image import convert_from_path
3
+ from PIL import Image
4
+ import pytesseract
5
+ from docx import Document
6
+
7
+
8
def create_or_empty_dir(directory):
    """
    Ensure ``directory`` exists and is empty.

    The directory (including missing parents) is created when absent. When it
    already exists, everything inside it is deleted: plain files and symlinks
    are unlinked, sub-directories are removed recursively.

    Args:
        directory (str): The directory path.

    Raises:
        OSError: If ``directory`` exists but is not a directory, or an entry
            cannot be removed.
    """
    # Function-scope import keeps this block self-contained.
    import shutil

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)

    # Snapshot the entries first so nothing is deleted while the directory
    # iterator is still open.
    with os.scandir(directory) as entries:
        stale_entries = list(entries)

    for entry in stale_entries:
        if entry.is_dir(follow_symlinks=False):
            # os.remove() cannot delete directories; remove the whole subtree.
            shutil.rmtree(entry.path)
        else:
            os.remove(entry.path)
23
+
24
+
25
def convert_pdf_to_images(input_pdf, output_dir):
    """
    Convert a PDF file to a series of JPEG images, one per page.

    Pages are written as ``page_<i>.jpg`` where ``<i>`` is the zero-based
    position of the page in the list returned by ``convert_from_path``.

    Args:
        input_pdf (str): The path to the input PDF file.
        output_dir (str): The directory to save the converted images. It is
            created if it does not exist yet.
    """
    # Saving into a missing directory would fail, so create it on demand.
    os.makedirs(output_dir, exist_ok=True)

    pages = convert_from_path(input_pdf)

    # Save each page as a JPEG file using Pillow
    for page_index, page in enumerate(pages):
        page.save(os.path.join(output_dir, f"page_{page_index}.jpg"), "JPEG")
39
+
40
+
41
def extract_text_from_image(image_path):
    """
    Extract text from an image using OCR (Optical Character Recognition).

    Args:
        image_path (str): The path to the input image file.

    Returns:
        str: The text returned by ``pytesseract.image_to_string`` for the image.
    """
    # Open the image in a context manager so its file handle is released as
    # soon as OCR finishes, rather than whenever the object is collected.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)
54
+
55
+
56
def _page_number(filename):
    """Return the page number encoded in an image filename, or None if the
    file is not a ``.png``/``.jpg`` image named like ``page_<n>.<ext>``."""
    if not filename.endswith((".png", ".jpg")):
        return None
    try:
        return int(filename.split("_")[1].split(".")[0])
    except (IndexError, ValueError):
        # No underscore, or a non-numeric token after it: not a page image.
        return None


def _xml_safe(text):
    """Drop control characters below U+0020 other than tab, LF and CR."""
    return "".join(ch for ch in text if ch in "\t\n\r" or ord(ch) >= 0x20)


def create_docx_with_text(image_folder, output_docx):
    """
    Create a Word document (.docx) with text extracted from images.

    Every ``.png``/``.jpg`` file in ``image_folder`` whose name carries a page
    number (``page_<n>.<ext>``) is OCR'd in ascending page order and appended
    to the document as one paragraph per image. Other directory entries
    (e.g. ``.DS_Store``) are ignored instead of breaking the sort.

    Args:
        image_folder (str): The directory containing the input images.
        output_docx (str): The path to save the output Word document.
    """
    document = Document()

    # Filter BEFORE sorting: the numeric sort key is only meaningful for page
    # images, and applying it to arbitrary filenames raises.
    numbered_images = []
    for filename in os.listdir(image_folder):
        page_number = _page_number(filename)
        if page_number is not None:
            numbered_images.append((page_number, filename))

    for _, filename in sorted(numbered_images):
        text = extract_text_from_image(os.path.join(image_folder, filename))
        # The text is kept as Unicode: re-decoding its UTF-8 bytes as latin-1
        # would turn every non-ASCII character into mojibake.
        # NOTE(review): control characters are stripped on the assumption that
        # the OCR output may contain some (e.g. a form feed page separator)
        # that the XML layer under python-docx rejects; confirm.
        document.add_paragraph(_xml_safe(text))

    document.save(output_docx)