arithescientist committed on
Commit
71cefa8
·
1 Parent(s): f351a77

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -24
app.py CHANGED
@@ -27,36 +27,25 @@ bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom
27
 
28
 
29
  def pdf(file):
30
- with open(file, 'r', encoding='utf-8') as files:
31
- lines = files.readlines()
32
-
33
- pages= pdf2image.convert_from_path(pdf_path=files, dpi=400, size=(1654,2340))
34
- content = ""
35
- dir_name = 'images/' + files + '/'
36
- os.makedirs(dir_name, exist_ok=True)
37
- # If folder doesn't exist, then create it.
38
- for i in range(len(pages)-1):
39
- pages[i].save(dir_name + str(i) + '.jpg')
40
- # OCR the image using Google's tesseract
41
- content += pt.image_to_string(pages[i])
42
 
43
- output_text= bert_legal_model(content, min_length = 8, ratio = 0.05)
44
- output_text = output_text.replace(' ',' ')
45
- output_text = output_text .replace(',.',',')
46
- output_text = output_text .replace('\n',' ')
47
- output_text = output_text .replace('..','.')
48
 
49
- pdf = FPDF()
50
 
51
- # Add a page
52
- pdf.add_page()
53
- pdf.set_font("Times", size = 12)
54
 
55
- # open the text file in read mode
56
- f = all_text2
57
 
 
 
58
 
59
- return pdf.output("legal.pdf")
60
 
61
 
62
 
 
27
 
28
 
29
  def pdf(file):
30
+ temp = open(file, 'rb')
31
+ PDF_read = PDFFileReader(temp)
32
+ first_page = PDF_read.getPage
 
 
 
 
 
 
 
 
 
33
 
34
+
35
+ pdfFileObject = open(file, 'rb')
36
+
37
+ pdfReader = PyPDF2.PdfFileReader(pdfFileObject)
 
38
 
39
+ print(" No. Of Pages :", pdfReader.numPages)
40
 
41
+ pageObject = pdfReader.getPage(0)
 
 
42
 
43
+ print(pageObject.extractText())
 
44
 
45
+
46
+
47
 
48
+ return pageObject
49
 
50
 
51