File size: 1,194 Bytes
daf0288
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25

import pdfplumber

# Default input used when no path is given on the command line.
DEFAULT_PDF_PATH = '/myhome/alps/TestingFiles/OCRTest1German.pdf'


def print_page_words(page):
    """Print the words extracted from a single page.

    The complete list returned by ``page.extract_words()`` is printed first,
    followed by the ``'text'`` value of every word on its own line.

    Args:
        page: Any object exposing ``extract_words()`` that returns a list of
            dicts with at least a ``'text'`` key (e.g. a pdfplumber page).

    Returns:
        The list returned by ``page.extract_words()``.
    """
    # extract_words() returns a list of all word-looking things and their
    # bounding boxes. Example:
    # [{'text': 'Inhaltsverzeichnis', 'x0': 33.99, 'x1': 111.77713499999999,
    #   'top': 36.59723999999994, 'doctop': 36.59723999999994,
    #   'bottom': 46.58723999999995, 'upright': True,
    #   'height': 9.990000000000009, 'width': 77.78713499999998,
    #   'direction': 'ltr'}, ...]
    extracted_words = page.extract_words()
    print(extracted_words)
    for word in extracted_words:
        # Positional data such as word['x0'] is available here as well.
        print(word['text'])
    return extracted_words


def main(pdf_path=DEFAULT_PDF_PATH):
    """Open ``pdf_path`` with pdfplumber and print the words of every page.

    Args:
        pdf_path: Path to the PDF file, or a file object loaded as bytes.
    """
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            print_page_words(page)
    # NOTE(review): alternative approach kept from the original notes --
    # Page.extract_text(...) grabs every character on the page, line by
    # line; keep_blank_chars=True is said to retain whitespace characters
    # as literal characters: text = page.extract_text(keep_blank_chars=True)
    # Confirm against the installed pdfplumber version before relying on it.


if __name__ == '__main__':
    import sys

    # An optional first CLI argument overrides the default PDF path.
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PDF_PATH)