alps / ocr_component2.py
yumikimi381's picture
Upload folder using huggingface_hub
daf0288 verified
import pdfplumber
if __name__ == '__main__':
    # Demo: open a PDF with pdfplumber and print the words found on each page.
    #
    # The input to pdfplumber.open() can be a path to a PDF file or a file
    # object, loaded as bytes.
    pdf_path = '/myhome/alps/TestingFiles/OCRTest1German.pdf'

    # The context manager closes the PDF when the block exits.
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_words() returns a list of all word-looking things and
            # their bounding boxes, one dict per word. Example:
            # [{'text': 'Inhaltsverzeichnis', 'x0': 33.99,
            #   'x1': 111.77713499999999, 'top': 36.59723999999994,
            #   'doctop': 36.59723999999994, 'bottom': 46.58723999999995,
            #   'upright': True, 'height': 9.990000000000009,
            #   'width': 77.78713499999998, 'direction': 'ltr'}, ...]
            extracted_words = page.extract_words()

            # First dump the raw word dicts for the page, then the text of
            # each word on its own line.
            print(extracted_words)
            for word in extracted_words:
                print(word['text'])
"""
Note: the Page.extract_text(...) method grabs every character on the page, line by line.
Passing keep_blank_chars=True retains all whitespace characters as literal characters.
Example (p0 is a single page object, e.g. pdf.pages[0]):
text = p0.extract_text(keep_blank_chars=True)
"""