Spaces:
Build error
Build error
import sys

import pdfplumber
if __name__ == '__main__':
    # Demo script: open a PDF with pdfplumber and print the words found on
    # every page.
    #
    # The input can be a path to your PDF file or a file object, loaded as
    # bytes. An optional first command-line argument overrides the default
    # path below, so the script is not tied to a single machine.
    default_pdf_path = '/myhome/alps/TestingFiles/OCRTest1German.pdf'
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else default_pdf_path

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # NOTE: page.to_image() can be called here when a rendered page
            # image is wanted; it is not needed to extract the words, so the
            # page is no longer rendered on every iteration.

            # extract_words() - Returns a list of all word-looking things and
            # their bounding boxes.
            # Example (first entry; further entries have the same keys):
            # [{'text': 'Inhaltsverzeichnis', 'x0': 33.99,
            #   'x1': 111.77713499999999, 'top': 36.59723999999994,
            #   'doctop': 36.59723999999994, 'bottom': 46.58723999999995,
            #   'upright': True, 'height': 9.990000000000009,
            #   'width': 77.78713499999998, 'direction': 'ltr'},
            #  {'text': 'Übersicht', 'x0': 33.99, 'x1': 71.4912, ...}, ...]
            extracted_words = page.extract_words()
            print(extracted_words)

            for word in extracted_words:
                # Each word is a dict; besides 'text' it also carries its
                # position, e.g. word['x0'] (see the example above).
                print(word['text'])

    # Alternative approach (not executed here):
    # Using the Page.extract_text(...) method, we grab every character on the
    # page, line by line, using keep_blank_chars=True to retain all those
    # whitespace characters as literal characters:
    #     text = p0.extract_text(keep_blank_chars=True)