ChinmayBH commited on
Commit
503c3e1
1 Parent(s): 8199bf8

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +376 -0
  2. requirements.txt +53 -0
app.py ADDED
@@ -0,0 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import fitz
4
+ import pdfplumber
5
+ import pandas as pd
6
+ import streamlit as st
7
+ from tempfile import NamedTemporaryFile
8
+ from PIL import Image
9
+ import io
10
+
11
def extract_text_images(
    pdf_path: str, output_folder: str,
    minimum_font_size: int,
    extract_text: bool = True,
    extract_images: bool = True,
    mode: str = 'headerwise',
    header_font_sizes: list[float] | None = None,
    tolerance: float = 0.01,
) -> list:
    """
    Extracts text and/or images from a PDF and organizes them either by headers or by pages.

    Params
    -------
    pdf_path: str
        Path to the input PDF file.
    output_folder: str
        Path to the output folder where extracted images will be saved.
    minimum_font_size: int
        Text spans with a font size smaller than this are skipped entirely.
    extract_text: bool
        Whether to extract text.
    extract_images: bool
        Whether to extract images.
    mode: str
        Extraction mode, either 'headerwise' or 'pagewise'.
    header_font_sizes: list[float] | None
        List of font sizes to be considered as headers.
        Required when mode is 'headerwise'.
    tolerance: float
        Tolerance for font size comparison.

    Returns
    -------
    list
        In 'headerwise' mode: list of {'header': str, 'content': list} dicts.
        In 'pagewise' mode: list of {'page': int, 'content': list} dicts.
        (The original annotation said ``dict``, but a list was always returned.)

    Raises
    ------
    ValueError
        If mode is 'headerwise' and no header font sizes were supplied
        (previously this surfaced as an opaque TypeError deep in the loop).
    """
    if mode == 'headerwise' and not header_font_sizes:
        raise ValueError(
            "header_font_sizes must be a non-empty list when mode='headerwise'")

    os.makedirs(output_folder, exist_ok=True)

    extraction_data = []
    current_header = None
    current_header_content = []

    def add_current_header_content() -> None:
        """
        Adds the current header and its content to the extraction data.
        """
        nonlocal current_header, current_header_content
        if current_header:
            extraction_data.append({
                'header': current_header,
                'content': current_header_content
            })
            current_header_content = []
            current_header = None

    def is_header_font_size(font_size: float) -> bool:
        """
        Checks if a given font size matches any of the header font sizes.
        """
        return any(
            abs(font_size - header_font_size) <= tolerance
            for header_font_size in header_font_sizes
        )

    pdf_document = fitz.open(pdf_path)
    try:
        for page_number in range(pdf_document.page_count):
            page = pdf_document.load_page(page_number)
            elements = []

            if extract_text:
                # Extract text blocks with their positions and font sizes
                text_blocks = page.get_text("dict")["blocks"]
                lines = {}

                # Group text spans by their vertical position (top) to form lines
                for block in text_blocks:
                    if block["type"] == 0:  # Text block
                        for line in block["lines"]:
                            for span in line["spans"]:
                                font_size = span["size"]
                                top = span["bbox"][1]

                                # Skip spans below the configured minimum size
                                # (original comment hard-coded "10" here)
                                if font_size < minimum_font_size:
                                    continue

                                lines.setdefault(top, []).append(span)

                # Process each line to check if it's a header
                for top in sorted(lines.keys()):
                    line = lines[top]
                    line_text = " ".join([span['text'] for span in line])
                    line_font_size = line[0]['size']

                    elements.append({
                        'type': 'text',
                        'font_size': line_font_size,
                        'page': page_number + 1,
                        'content': line_text,
                        'x0': line[0]['bbox'][0],
                        'top': top
                    })

            if extract_images:
                # Extract images using PyMuPDF
                image_list = page.get_images(full=True)

                for img_index, img in enumerate(image_list):
                    xref = img[0]
                    base_image = pdf_document.extract_image(xref)
                    image_bytes = base_image["image"]
                    image_filename = os.path.join(
                        output_folder,
                        f"page_{page_number + 1}_img_{img_index + 1}.png"
                    )

                    with open(image_filename, "wb") as img_file:
                        img_file.write(image_bytes)

                    # Get the position of the image on the page
                    img_rect = page.get_image_bbox(img)
                    elements.append({
                        'type': 'image',
                        'page': page_number + 1,
                        'path': image_filename,
                        'x0': img_rect.x0,
                        'top': img_rect.y0
                    })

            # Sort elements by their vertical position (top) first,
            # and then by horizontal position (x0)
            elements.sort(key=lambda e: (e['top'], e['x0']))

            if mode == 'headerwise':
                # Process elements to extract headers and content
                for element in elements:
                    if element['type'] == 'text' and \
                            is_header_font_size(element['font_size']):
                        # A new header finalizes the current header's content
                        add_current_header_content()
                        current_header = element['content']
                    elif element['type'] == 'text':
                        if current_header_content and \
                                current_header_content[-1]['type'] == 'text':
                            # Merge consecutive text runs into a single entry
                            current_header_content[-1]['content'] \
                                += " " + element['content']
                        else:
                            current_header_content.append({
                                'type': 'text',
                                'content': element['content']
                            })
                    elif element['type'] == 'image':
                        current_header_content.append({
                            'type': 'image',
                            'path': element['path']
                        })

            elif mode == 'pagewise':
                page_content = []
                for element in elements:
                    if element['type'] == 'text':
                        if page_content and \
                                page_content[-1]['type'] == 'text':
                            page_content[-1]['content'] \
                                += " " + element['content']
                        else:
                            page_content.append({
                                'type': 'text',
                                'content': element['content']
                            })
                    elif element['type'] == 'image':
                        page_content.append({
                            'type': 'image',
                            'path': element['path']
                        })
                extraction_data.append({
                    'page': page_number + 1,
                    'content': page_content
                })

        # After the loop, finalize any remaining header content
        if mode == 'headerwise':
            add_current_header_content()
    finally:
        # Ensure the document handle is released even if extraction fails
        # mid-way (the original leaked it on any exception).
        pdf_document.close()

    return extraction_data
201
+
202
def get_word_font_sizes(pdf_path, target_words):
    """Collect every font size observed for each target word in a PDF.

    Returns a dict mapping each word in ``target_words`` to the list of
    font sizes at which it appeared (empty list if the word never occurs).
    """
    sizes_by_word = {w: [] for w in target_words}

    with pdfplumber.open(pdf_path) as document:
        for current_page in document.pages:
            for token in current_page.extract_words(extra_attrs=['fontname', 'size']):
                stripped = token['text'].strip()
                if stripped in target_words:
                    sizes_by_word[stripped].append(token['size'])
    return sizes_by_word
213
+
214
def preview_pdf(pdf_path, num_pages=1):
    """Render the first ``num_pages`` pages of a PDF as PIL RGB images."""
    document = fitz.open(pdf_path)
    rendered = []

    page_limit = min(num_pages, document.page_count)
    for index in range(page_limit):
        pixmap = document.load_page(index).get_pixmap()
        rendered.append(
            Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
        )

    document.close()
    return rendered
226
+
227
# Streamlit UI

def main():
    """Streamlit entry point.

    Lets the user upload a PDF, preview it, configure extraction in the
    sidebar (mode, header font sizes, output folder, filters), run the
    extraction, and download the results as JSON or XLSX.
    """
    # setting page config
    st.set_page_config(
        page_title="Object counting",
        page_icon="🧊",
        layout="wide",
        initial_sidebar_state="expanded",
        menu_items={
            'Get Help': 'https://www.extremelycoolapp.com/help',
            'Report a bug': "https://www.extremelycoolapp.com/bug",
        }
    )

    st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNACHER</h1>",
                unsafe_allow_html=True)
    st.markdown(
        "<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>",
        unsafe_allow_html=True
    )
    st.markdown(
        "<h5 style='text-align: center;color: red;'>Step 1: Upload pdf </h5>",
        unsafe_allow_html=True
    )
    st.markdown(
        "<h5 style='text-align: center;color: red;'>Step 2: Fill the values at right in data extraction settings </h5>",
        unsafe_allow_html=True
    )
    st.markdown(
        "<h5 style='text-align: center;color: red;'>Step 3: Download the data in desired format </h5>",
        unsafe_allow_html=True
    )

    uploaded_pdf = st.file_uploader("Upload PDF", type="pdf")
    if uploaded_pdf:
        # Save the uploaded PDF to a temporary file
        with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
            temp_pdf.write(uploaded_pdf.read())
            temp_pdf_path = temp_pdf.name

        # Collapsible PDF Preview
        with st.expander("PDF Preview", expanded=True):
            num_pages = st.slider("Number of pages to preview", min_value=1, max_value=5, value=1)
            preview_images = preview_pdf(temp_pdf_path, num_pages)

            # enumerate instead of list.index(): index() is O(n) per page and
            # returns the wrong number when two pages render identically.
            for page_idx, img in enumerate(preview_images):
                st.image(img, caption=f"Page {page_idx + 1}", use_column_width=True)

        st.sidebar.title("DATA EXTRACTION SETTINGS")
        st.sidebar.write("How you want to extract data?")

        extraction_mode = st.sidebar.radio("Extraction Mode", ["headerwise", "pagewise"])
        # Font Size Detection
        st.sidebar.title("FONT SIZE DETECTION")
        st.sidebar.warning("[Only in case of headerwise extraction] if you dont know the font size for your headers or text then copy paste any of those words below")
        target_words_input = st.sidebar.text_input(
            "Target words (comma-separated)", "")
        # Drop empty fragments so a blank/trailing comma doesn't produce ""
        target_words = [word.strip() for word in target_words_input.split(",") if word.strip()]

        if st.sidebar.button("Get Font Sizes"):
            word_font_sizes = get_word_font_sizes(temp_pdf_path, target_words)
            for word, sizes in word_font_sizes.items():
                st.sidebar.write(f"Word: {word}, Font sizes: {sizes}")

        header_font_sizes_input = st.sidebar.text_input("Header Font Sizes (comma-separated)", "0")
        # Skip blank fragments so clearing the field doesn't crash on
        # float(""); fall back to the widget's default value [0.0].
        header_font_sizes = [
            float(size.strip())
            for size in header_font_sizes_input.split(",")
            if size.strip()
        ] or [0.0]
        st.sidebar.title("OUTPUT FOLDER PATH")
        # Fix: the original default, os.path.join(os.path.dirname("Extracted_Data")),
        # evaluates to "" — extraction silently targeted the empty path.
        output_folder = st.sidebar.text_input(" ", value="Extracted_Data")
        st.sidebar.info("what do you want to include in data extraction?")
        extract_text = st.sidebar.checkbox("Extract Text", value=True)
        extract_images = st.sidebar.checkbox("Extract Images", value=True)

        minimum_font_size = st.sidebar.number_input("Minimum Font Size", min_value=1, value=10)

        if st.sidebar.button("Start Extraction"):
            os.makedirs(output_folder, exist_ok=True)

            extracted_data = extract_text_images(
                temp_pdf_path,
                output_folder,
                minimum_font_size=minimum_font_size,
                extract_text=extract_text,
                extract_images=extract_images,
                mode=extraction_mode,
                header_font_sizes=header_font_sizes
            )

            # Display extracted data as JSON
            st.json(extracted_data)

            # Convert extracted data to a pandas DataFrame
            def extract_to_dataframe(data):
                """Flatten headerwise/pagewise extraction output into rows."""
                rows = []
                for item in data:
                    if 'header' in item:
                        header = item['header']
                        for content_item in item['content']:
                            if content_item['type'] == 'text':
                                rows.append({'Header': header, 'Content': content_item['content']})
                            elif content_item['type'] == 'image':
                                rows.append({'Header': header, 'Content': f"Image: {content_item['path']}"})
                    elif 'page' in item:
                        page_num = item['page']
                        for content_item in item['content']:
                            if content_item['type'] == 'text':
                                rows.append({'Page': page_num, 'Content': content_item['content']})
                            elif content_item['type'] == 'image':
                                rows.append({'Page': page_num, 'Content': f"Image: {content_item['path']}"})
                return pd.DataFrame(rows)

            df = extract_to_dataframe(extracted_data)

            # Save DataFrame to an in-memory BytesIO buffer
            buffer = io.BytesIO()
            with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
                df.to_excel(writer, index=False, sheet_name='Extracted Data')
            buffer.seek(0)

            # Preview the first 5 lines of the XLSX data
            st.subheader("Preview of Extracted Data (First 5 Lines)")
            preview_df = pd.read_excel(buffer, sheet_name='Extracted Data')
            st.dataframe(preview_df.head())

            # Provide download options
            st.download_button(
                label="Download JSON",
                data=json.dumps(extracted_data, ensure_ascii=False),
                file_name='extracted_data.json',
                mime='application/json'
            )

            st.download_button(
                label="Download XLSX",
                data=buffer,
                file_name='extracted_data.xlsx',
                mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
            )

            st.success("Extraction complete. Data displayed as JSON.")

if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==5.3.0
2
+ attrs==24.1.0
3
+ blinker==1.8.2
4
+ cachetools==5.4.0
5
+ certifi==2024.7.4
6
+ cffi==1.16.0
7
+ charset-normalizer==3.3.2
8
+ click==8.1.7
9
+ colorama==0.4.6
10
+ cryptography==43.0.0
11
+ et-xmlfile==1.1.0
12
+ gitdb==4.0.11
13
+ GitPython==3.1.43
14
+ idna==3.7
15
+ Jinja2==3.1.4
16
+ jsonschema==4.23.0
17
+ jsonschema-specifications==2023.12.1
18
+ markdown-it-py==3.0.0
19
+ MarkupSafe==2.1.5
20
+ mdurl==0.1.2
21
+ numpy==2.0.1
22
+ openpyxl==3.1.5
23
+ packaging==24.1
24
+ pandas==2.2.2
25
+ pdfminer.six==20231228
26
+ pdfplumber==0.11.2
27
+ pillow==10.4.0
28
+ protobuf==5.27.3
29
+ pyarrow==17.0.0
30
+ pycparser==2.22
31
+ pydeck==0.9.1
32
+ Pygments==2.18.0
33
+ PyMuPDF==1.24.9
34
+ PyMuPDFb==1.24.9
35
+ pypdfium2==4.30.0
36
+ python-dateutil==2.9.0.post0
37
+ pytz==2024.1
38
+ referencing==0.35.1
39
+ requests==2.32.3
40
+ rich==13.7.1
41
+ rpds-py==0.19.1
42
+ six==1.16.0
43
+ smmap==5.0.1
44
+ streamlit==1.37.0
45
+ tenacity==8.5.0
46
+ toml==0.10.2
47
+ toolz==0.12.1
48
+ tornado==6.4.1
49
+ typing_extensions==4.12.2
50
+ tzdata==2024.1
51
+ urllib3==2.2.2
52
+ watchdog==4.0.1
53
+ XlsxWriter==3.2.0