harish3110's picture
Updated helper to fix error when value is a dict
c95e708
import io
import os
import pypdf
from collections import Counter
def limit_pagenumbers(filepath, pages_limit=20):
    """Truncate the PDF at *filepath* in place to at most *pages_limit* pages.

    The file is rewritten only when it has more than ``pages_limit`` pages
    or when pypdf reports it as encrypted; otherwise it is left untouched.
    When rewritten, the leading pages (up to the limit) are copied into a
    fresh ``pypdf.PdfWriter`` and the result overwrites the original file.

    Args:
        filepath: Path of the PDF to inspect and, if needed, overwrite.
        pages_limit: Maximum number of leading pages to keep.

    Returns:
        None. The effect is the (possible) modification of the file.
    """
    with open(filepath, "rb") as source:
        pdf = pypdf.PdfReader(source)
        total_pages = len(pdf.pages)
        encrypted = pdf.is_encrypted

        # Nothing to do for short, unencrypted documents.
        if not (total_pages > pages_limit or encrypted):
            return

        trimmed = pypdf.PdfWriter()
        for index in range(total_pages):
            # Stop as soon as the page index reaches the limit.
            if not index < pages_limit:
                break
            trimmed.add_page(pdf.pages[index])

        # Serialise to memory while the source file is still open, so the
        # original can be safely reopened for writing afterwards.
        payload = io.BytesIO()
        trimmed.write(payload)

    # Replace the original file with the trimmed document.
    with open(filepath, "wb") as target:
        target.write(payload.getvalue())
def majority_vote_dicts(dicts):
    """Merge several dicts by picking the most frequent value for each key.

    Every value is converted with ``str()`` before being counted, so
    unhashable values such as dicts or lists can be tallied. As a result,
    the returned values are always strings (e.g. ``3`` is reported as
    ``"3"``, and ``3`` and ``"3"`` count as the same vote).

    The keys ``'page_number'`` and ``'text_sequence'`` are ignored and never
    appear in the result.

    Args:
        dicts: Iterable of dicts to combine.

    Returns:
        A dict mapping each remaining key to the string form of its most
        common value. When several values are tied, the one encountered
        first wins. An empty input yields an empty dict.
    """
    skipped_keys = ('page_number', 'text_sequence')

    tallies = {}
    for record in dicts:
        for key, value in record.items():
            if key in skipped_keys:
                continue
            # str() returns a plain str unchanged, so no type check is
            # needed; it also makes unhashable values countable.
            tallies.setdefault(key, Counter())[str(value)] += 1

    # Each Counter holds at least one entry, so most_common(1) is non-empty.
    return {
        key: votes.most_common(1)[0][0]
        for key, votes in tallies.items()
    }