Spaces:

arithescientist
/

lincolnlegal

Sleeping

arithescientist committed on Jun 1, 2022

Commit

71cefa8

1 Parent(s): f351a77

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -27,36 +27,25 @@ bert_legal_model = Summarizer(custom_model=custom_model, custom_tokenizer=custom
 def pdf(file):
-    with open(file, 'r', encoding='utf-8') as files:
-        lines = files.readlines()
-    pages= pdf2image.convert_from_path(pdf_path=files, dpi=400, size=(1654,2340))
-    content = ""
-    dir_name = 'images/' + files + '/'
-    os.makedirs(dir_name, exist_ok=True)
-      # If folder doesn't exist, then create it.
-    for i in range(len(pages)-1):
-        pages[i].save(dir_name + str(i) + '.jpg')
-          # OCR the image using Google's tesseract
-        content += pt.image_to_string(pages[i])
-    output_text= bert_legal_model(content,  min_length = 8, ratio = 0.05)
-    output_text = output_text.replace('   ',' ')
-    output_text = output_text .replace(',.',',')
-    output_text = output_text .replace('\n','  ')
-    output_text = output_text .replace('..','.')
-    pdf = FPDF()
-    # Add a page
-    pdf.add_page()
-    pdf.set_font("Times", size = 12)
-    # open the text file in read mode
-    f = all_text2
-    return pdf.output("legal.pdf")

 def pdf(file):
+    temp = open(file, 'rb')
+    PDF_read = PDFFileReader(temp)
+    first_page = PDF_read.getPage
+    pdfFileObject = open(file, 'rb')
+    pdfReader = PyPDF2.PdfFileReader(pdfFileObject)
+    print(" No. Of Pages :", pdfReader.numPages)
+    pageObject = pdfReader.getPage(0)
+    print(pageObject.extractText())
+    return pageObject