Spaces:
Sleeping
Sleeping
Commit
·
71cefa8
1
Parent(s):
f351a77
Update app.py
Browse files
app.py
CHANGED
@@ -27,36 +27,25 @@ bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom
|
|
27 |
|
28 |
|
29 |
def pdf(file):
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
pages= pdf2image.convert_from_path(pdf_path=files, dpi=400, size=(1654,2340))
|
34 |
-
content = ""
|
35 |
-
dir_name = 'images/' + files + '/'
|
36 |
-
os.makedirs(dir_name, exist_ok=True)
|
37 |
-
# If folder doesn't exist, then create it.
|
38 |
-
for i in range(len(pages)-1):
|
39 |
-
pages[i].save(dir_name + str(i) + '.jpg')
|
40 |
-
# OCR the image using Google's tesseract
|
41 |
-
content += pt.image_to_string(pages[i])
|
42 |
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
output_text = output_text .replace('..','.')
|
48 |
|
49 |
-
|
50 |
|
51 |
-
|
52 |
-
pdf.add_page()
|
53 |
-
pdf.set_font("Times", size = 12)
|
54 |
|
55 |
-
|
56 |
-
f = all_text2
|
57 |
|
|
|
|
|
58 |
|
59 |
-
return
|
60 |
|
61 |
|
62 |
|
|
|
27 |
|
28 |
|
29 |
def pdf(file):
|
30 |
+
temp = open(file, 'rb')
|
31 |
+
PDF_read = PDFFileReader(temp)
|
32 |
+
first_page = PDF_read.getPage
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
+
|
35 |
+
pdfFileObject = open(file, 'rb')
|
36 |
+
|
37 |
+
pdfReader = PyPDF2.PdfFileReader(pdfFileObject)
|
|
|
38 |
|
39 |
+
print(" No. Of Pages :", pdfReader.numPages)
|
40 |
|
41 |
+
pageObject = pdfReader.getPage(0)
|
|
|
|
|
42 |
|
43 |
+
print(pageObject.extractText())
|
|
|
44 |
|
45 |
+
|
46 |
+
|
47 |
|
48 |
+
return pageObject
|
49 |
|
50 |
|
51 |
|