"""Helpers for truncating PDF files and merging dicts by majority vote."""

import io
import os
from collections import Counter

import pypdf


def limit_pagenumbers(filepath, pages_limit: int = 20) -> None:
    """Rewrite the PDF at *filepath* in place, keeping at most *pages_limit* pages.

    The file is left untouched when it has no more than ``pages_limit`` pages
    and pypdf does not report it as encrypted. Otherwise the leading pages
    (up to ``pages_limit``) are copied into a fresh PDF that overwrites the
    original file.

    Args:
        filepath: Path of the PDF to inspect and, if needed, overwrite.
        pages_limit: Maximum number of leading pages to keep. A value of
            zero or less results in no pages being copied.

    Returns:
        None. The only effect is the possible rewrite of ``filepath``.
    """
    with open(filepath, "rb") as source:
        reader = pypdf.PdfReader(source)
        num_pages = len(reader.pages)

        # Encrypted files are rewritten even when they are short enough.
        # NOTE(review): presumably this relies on pypdf being able to read
        # the pages without an explicit password -- confirm.
        if num_pages <= pages_limit and not reader.is_encrypted:
            return

        writer = pypdf.PdfWriter()
        # Visit only the pages that are kept instead of scanning all of them.
        for page_index in range(min(num_pages, pages_limit)):
            writer.add_page(reader.pages[page_index])

        # Serialize to memory while the source file is still open: the
        # writer pulls page content from the reader's underlying stream.
        buffer = io.BytesIO()
        writer.write(buffer)

    # The read handle is closed by now, so the same path can be reopened
    # for writing and replaced with the truncated document.
    with open(filepath, "wb") as output_file:
        output_file.write(buffer.getvalue())


def majority_vote_dicts(dicts) -> dict:
    """Merge several dicts by taking, for each key, its most frequent value.

    Values are compared and returned by their ``str()`` form, so non-string
    values come back as strings. The keys ``'page_number'`` and
    ``'text_sequence'`` are skipped. When several values tie for a key, the
    one seen first wins (``Counter.most_common`` keeps first-encountered
    order among equal counts).

    Args:
        dicts: Iterable of dicts to combine.

    Returns:
        A dict mapping every non-skipped key found in any input to the
        string form of its most common value; empty if *dicts* is empty.
    """
    skipped_keys = ('page_number', 'text_sequence')

    tallies = {}
    for entry in dicts:
        for key, value in entry.items():
            if key in skipped_keys:
                continue
            if key not in tallies:
                tallies[key] = Counter()
            # str() returns an exact ``str`` unchanged, so strings need no
            # special case before being counted.
            tallies[key][str(value)] += 1

    # Every Counter holds at least one entry, so most_common(1) is non-empty.
    return {key: votes.most_common(1)[0][0] for key, votes in tallies.items()}