Upload 2 files
- app.py +376 -0
- requirements.txt +53 -0
app.py
ADDED
@@ -0,0 +1,376 @@
import os
import json
import fitz
import pdfplumber
import pandas as pd
import streamlit as st
from tempfile import NamedTemporaryFile
from PIL import Image
import io

def extract_text_images(
    pdf_path: str, output_folder: str,
    minimum_font_size: int,
    extract_text: bool = True,
    extract_images: bool = True,
    mode: str = 'headerwise',
    header_font_sizes: list[float] | None = None,
    tolerance: float = 0.01,
) -> list:
    """
    Extracts text and/or images from a PDF and organizes them either by headers or by pages.

    Parameters
    ----------
    pdf_path: str
        Path to the input PDF file.
    output_folder: str
        Path to the output folder where extracted data will be saved.
    minimum_font_size: int
        Text spans with a smaller font size are skipped.
    extract_text: bool
        Whether to extract text.
    extract_images: bool
        Whether to extract images.
    mode: str
        Extraction mode, either 'headerwise' or 'pagewise'.
    header_font_sizes: list[float]
        List of font sizes to be considered as headers.
    tolerance: float
        Tolerance for font size comparison.

    Returns
    -------
    list
        List of dictionaries containing the extracted text and/or image data.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    extraction_data = []
    current_header = None
    current_header_content = []

    def add_current_header_content() -> None:
        """Adds the current header and its content to the extraction data."""
        nonlocal current_header, current_header_content
        if current_header:
            extraction_data.append({
                'header': current_header,
                'content': current_header_content
            })
            current_header_content = []
            current_header = None

    def is_header_font_size(font_size: float) -> bool:
        """Checks whether a given font size matches any of the header font sizes."""
        return any(
            abs(font_size - header_font_size) <= tolerance
            for header_font_size in (header_font_sizes or [])
        )

    pdf_document = fitz.open(pdf_path)

    for page_number in range(pdf_document.page_count):
        page = pdf_document.load_page(page_number)
        elements = []

        if extract_text:
            # Extract text blocks with their positions and font sizes
            text_blocks = page.get_text("dict")["blocks"]
            lines = {}

            # Group spans by their vertical position (top) to form lines
            for block in text_blocks:
                if block["type"] == 0:  # Text block
                    for line in block["lines"]:
                        for span in line["spans"]:
                            font_size = span["size"]
                            top = span["bbox"][1]

                            # Skip spans smaller than the minimum font size
                            if font_size < minimum_font_size:
                                continue

                            if top not in lines:
                                lines[top] = []
                            lines[top].append(span)

            # Process each line and record it as a text element
            for top in sorted(lines.keys()):
                line = lines[top]
                line_text = " ".join(span['text'] for span in line)
                line_font_size = line[0]['size']

                elements.append({
                    'type': 'text',
                    'font_size': line_font_size,
                    'page': page_number + 1,
                    'content': line_text,
                    'x0': line[0]['bbox'][0],
                    'top': top
                })

        if extract_images:
            # Extract images using PyMuPDF
            image_list = page.get_images(full=True)

            for img_index, img in enumerate(image_list):
                xref = img[0]
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]
                image_filename = os.path.join(
                    output_folder,
                    f"page_{page_number + 1}_img_{img_index + 1}.png"
                )

                with open(image_filename, "wb") as img_file:
                    img_file.write(image_bytes)

                # Get the position of the image on the page
                img_rect = page.get_image_bbox(img)
                elements.append({
                    'type': 'image',
                    'page': page_number + 1,
                    'path': image_filename,
                    'x0': img_rect.x0,
                    'top': img_rect.y0
                })

        # Sort elements by their vertical position (top) first,
        # then by horizontal position (x0)
        elements.sort(key=lambda e: (e['top'], e['x0']))

        if mode == 'headerwise':
            # Process elements to extract headers and content
            for element in elements:
                if element['type'] == 'text' and \
                        is_header_font_size(element['font_size']):
                    # A new header was found; finalize the current header content
                    add_current_header_content()
                    current_header = element['content']
                elif element['type'] == 'text':
                    if current_header_content and \
                            current_header_content[-1]['type'] == 'text':
                        current_header_content[-1]['content'] += " " + element['content']
                    else:
                        current_header_content.append({
                            'type': 'text',
                            'content': element['content']
                        })
                elif element['type'] == 'image':
                    current_header_content.append({
                        'type': 'image',
                        'path': element['path']
                    })

        elif mode == 'pagewise':
            page_content = []
            for element in elements:
                if element['type'] == 'text':
                    if page_content and page_content[-1]['type'] == 'text':
                        page_content[-1]['content'] += " " + element['content']
                    else:
                        page_content.append({
                            'type': 'text',
                            'content': element['content']
                        })
                elif element['type'] == 'image':
                    page_content.append({
                        'type': 'image',
                        'path': element['path']
                    })
            extraction_data.append({
                'page': page_number + 1,
                'content': page_content
            })

    # After the loop, finalize any remaining header content
    if mode == 'headerwise':
        add_current_header_content()

    pdf_document.close()

    return extraction_data

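# A minimal usage sketch (illustration only, not part of the app's flow below):
# assuming a local "sample.pdf" whose headers are set in 16 pt type, a
# headerwise call might look like:
#
#   data = extract_text_images(
#       "sample.pdf", "Extracted_Data",
#       minimum_font_size=10,
#       mode="headerwise",
#       header_font_sizes=[16.0],
#   )
#   # -> [{'header': '...', 'content': [{'type': 'text', 'content': '...'}, ...]}, ...]
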
def get_word_font_sizes(pdf_path, target_words):
    """Collects every font size at which each target word occurs in the PDF."""
    word_font_sizes = {word: [] for word in target_words}

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            words = page.extract_words(extra_attrs=['fontname', 'size'])
            for word in words:
                text = word['text'].strip()
                if text in target_words:
                    word_font_sizes[text].append(word['size'])
    return word_font_sizes

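# Sketch of the expected output (hypothetical values): a call such as
# get_word_font_sizes("sample.pdf", ["Introduction", "Methods"]) would return
# something like {'Introduction': [16.0], 'Methods': [16.0, 10.0]}, one size
# per occurrence. The sidebar below surfaces these values so users can choose
# header_font_sizes.
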
def preview_pdf(pdf_path, num_pages=1):
    """Renders the first `num_pages` pages of the PDF as PIL images."""
    pdf_document = fitz.open(pdf_path)
    preview_images = []

    for page_number in range(min(num_pages, pdf_document.page_count)):
        page = pdf_document.load_page(page_number)
        pix = page.get_pixmap()
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        preview_images.append(img)

    pdf_document.close()
    return preview_images

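# Note: get_pixmap() renders at PyMuPDF's default 72 dpi. For crisper previews
# one could pass a scaling matrix, e.g. page.get_pixmap(matrix=fitz.Matrix(2, 2))
# for 2x resolution; this is an optional tweak the app below does not use.
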
# Streamlit UI

def main():
    # Page configuration
    st.set_page_config(
        page_title="PDF Data Snatcher",
        page_icon="🧊",
        layout="wide",
        initial_sidebar_state="expanded",
        menu_items={
            'Get Help': 'https://www.extremelycoolapp.com/help',
            'Report a bug': "https://www.extremelycoolapp.com/bug",
        }
    )

    st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNATCHER</h1>",
                unsafe_allow_html=True)
    st.markdown(
        "<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly, and convert PDFs into editable text and high-quality images</h3>",
        unsafe_allow_html=True
    )
    st.markdown(
        "<h5 style='text-align: center;color: red;'>Step 1: Upload a PDF</h5>",
        unsafe_allow_html=True
    )
    st.markdown(
        "<h5 style='text-align: center;color: red;'>Step 2: Fill in the values in the data extraction settings sidebar</h5>",
        unsafe_allow_html=True
    )
    st.markdown(
        "<h5 style='text-align: center;color: red;'>Step 3: Download the data in your desired format</h5>",
        unsafe_allow_html=True
    )

    uploaded_pdf = st.file_uploader("Upload PDF", type="pdf")
    if uploaded_pdf:
        # Save the uploaded PDF to a temporary file
        with NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
            temp_pdf.write(uploaded_pdf.read())
            temp_pdf_path = temp_pdf.name

        # Collapsible PDF preview
        with st.expander("PDF Preview", expanded=True):
            num_pages = st.slider("Number of pages to preview", min_value=1, max_value=5, value=1)
            preview_images = preview_pdf(temp_pdf_path, num_pages)

            for page_index, img in enumerate(preview_images):
                st.image(img, caption=f"Page {page_index + 1}", use_column_width=True)

        st.sidebar.title("DATA EXTRACTION SETTINGS")
        st.sidebar.write("How do you want to extract the data?")

        extraction_mode = st.sidebar.radio("Extraction Mode", ["headerwise", "pagewise"])

        # Font size detection
        st.sidebar.title("FONT SIZE DETECTION")
        st.sidebar.warning("[Only needed for headerwise extraction] If you don't know the font size of your headers or text, copy and paste some of those words below.")
        target_words_input = st.sidebar.text_input(
            "Target words (comma-separated)", "")
        target_words = [word.strip() for word in target_words_input.split(",")]

        if st.sidebar.button("Get Font Sizes"):
            word_font_sizes = get_word_font_sizes(temp_pdf_path, target_words)
            for word, sizes in word_font_sizes.items():
                st.sidebar.write(f"Word: {word}, Font sizes: {sizes}")

        # Header sizes are only required for headerwise extraction
        header_font_sizes = st.sidebar.text_input("Header Font Sizes (comma-separated)", "0")
        header_font_sizes = [float(size.strip()) for size in header_font_sizes.split(",")]

        st.sidebar.title("OUTPUT FOLDER PATH")
        output_folder = st.sidebar.text_input("Output folder", value="Extracted_Data")
        st.sidebar.info("What do you want to include in the extraction?")
        extract_text = st.sidebar.checkbox("Extract Text", value=True)
        extract_images = st.sidebar.checkbox("Extract Images", value=True)

        minimum_font_size = st.sidebar.number_input("Minimum Font Size", min_value=1, value=10)

        if st.sidebar.button("Start Extraction"):
            if not os.path.exists(output_folder):
                os.makedirs(output_folder)

            extracted_data = extract_text_images(
                temp_pdf_path,
                output_folder,
                minimum_font_size=minimum_font_size,
                extract_text=extract_text,
                extract_images=extract_images,
                mode=extraction_mode,
                header_font_sizes=header_font_sizes
            )

            # Display extracted data as JSON
            st.json(extracted_data)

            # Convert extracted data to a pandas DataFrame
            def extract_to_dataframe(data):
                rows = []
                for item in data:
                    if 'header' in item:
                        header = item['header']
                        for content_item in item['content']:
                            if content_item['type'] == 'text':
                                rows.append({'Header': header, 'Content': content_item['content']})
                            elif content_item['type'] == 'image':
                                rows.append({'Header': header, 'Content': f"Image: {content_item['path']}"})
                    elif 'page' in item:
                        page_num = item['page']
                        for content_item in item['content']:
                            if content_item['type'] == 'text':
                                rows.append({'Page': page_num, 'Content': content_item['content']})
                            elif content_item['type'] == 'image':
                                rows.append({'Page': page_num, 'Content': f"Image: {content_item['path']}"})
                return pd.DataFrame(rows)

            df = extract_to_dataframe(extracted_data)

            # Save the DataFrame to an in-memory BytesIO buffer
            buffer = io.BytesIO()
            with pd.ExcelWriter(buffer, engine='xlsxwriter') as writer:
                df.to_excel(writer, index=False, sheet_name='Extracted Data')
            buffer.seek(0)

            # Preview the first 5 rows of the XLSX data
            st.subheader("Preview of Extracted Data (First 5 Rows)")
            preview_df = pd.read_excel(buffer, sheet_name='Extracted Data')
            st.dataframe(preview_df.head())
            buffer.seek(0)  # Rewind so the download button receives the full file

            # Provide download options
            st.download_button(
                label="Download JSON",
                data=json.dumps(extracted_data, ensure_ascii=False),
                file_name='extracted_data.json',
                mime='application/json'
            )

            st.download_button(
                label="Download XLSX",
                data=buffer,
                file_name='extracted_data.xlsx',
                mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
            )

            st.success("Extraction complete. Data displayed as JSON above.")

if __name__ == "__main__":
    main()
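To try this commit locally, the usual flow would be to install the pinned dependencies below and then launch the app, e.g. `pip install -r requirements.txt` followed by `streamlit run app.py`.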
requirements.txt
ADDED
@@ -0,0 +1,53 @@
altair==5.3.0
attrs==24.1.0
blinker==1.8.2
cachetools==5.4.0
certifi==2024.7.4
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
colorama==0.4.6
cryptography==43.0.0
et-xmlfile==1.1.0
gitdb==4.0.11
GitPython==3.1.43
idna==3.7
Jinja2==3.1.4
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
markdown-it-py==3.0.0
MarkupSafe==2.1.5
mdurl==0.1.2
numpy==2.0.1
openpyxl==3.1.5
packaging==24.1
pandas==2.2.2
pdfminer.six==20231228
pdfplumber==0.11.2
pillow==10.4.0
protobuf==5.27.3
pyarrow==17.0.0
pycparser==2.22
pydeck==0.9.1
Pygments==2.18.0
PyMuPDF==1.24.9
PyMuPDFb==1.24.9
pypdfium2==4.30.0
python-dateutil==2.9.0.post0
pytz==2024.1
referencing==0.35.1
requests==2.32.3
rich==13.7.1
rpds-py==0.19.1
six==1.16.0
smmap==5.0.1
streamlit==1.37.0
tenacity==8.5.0
toml==0.10.2
toolz==0.12.1
tornado==6.4.1
typing_extensions==4.12.2
tzdata==2024.1
urllib3==2.2.2
watchdog==4.0.1
XlsxWriter==3.2.0