dwipper committed on
Commit
cbbfccc
·
1 Parent(s): 7a2985c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -2
app.py CHANGED
@@ -7,8 +7,9 @@ import json
7
  from jinja2 import Template
8
  import requests
9
  from pdfminer.high_level import extract_text
10
- #import pdfkit
11
  import pdfkit
 
12
  # Initialize OpenAI
13
  openai.api_key = os.environ.get('OPENAI_API_KEY')
14
 
@@ -318,10 +319,28 @@ def login_auth(username, password):
318
 
319
  return False
320
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
321
 
322
  def pdf_to_text(contract_file_cmpt, contract_text_tbox, file_name_tbox):
323
 
324
- file_text = extract_text(contract_file_cmpt.name)
 
325
  original_file_name = contract_file_cmpt.name.split("/")[-1]
326
  redline_file_name = original_file_name.split(".")[0]+" Redline.pdf"
327
 
 
7
  from jinja2 import Template
8
  import requests
9
  from pdfminer.high_level import extract_text
10
+ import fitz
11
  import pdfkit
12
+
13
  # Initialize OpenAI
14
  openai.api_key = os.environ.get('OPENAI_API_KEY')
15
 
 
319
 
320
  return False
321
 
322
def extract_text_with_spacing(pdf_path):
    """Extract the text of a PDF, keeping line and paragraph breaks.

    Each page is read through PyMuPDF's "dict" text output. Within every
    block that carries text lines, the spans of a line are joined with a
    single space and the line is collected; after the last line of such a
    block a lone newline entry is collected to mark a paragraph boundary.
    Blocks without a "lines" key are skipped.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        All collected entries joined with newlines, so consecutive lines
        are separated by one newline and blocks by additional blank lines.
    """
    document = fitz.open(pdf_path)
    all_text = []

    try:
        for page in document:
            # Extract text in a dict structure
            blocks = page.get_text("dict")["blocks"]

            for block in blocks:
                # Only blocks that contain lines of text are of interest.
                if "lines" not in block:
                    continue

                for line in block["lines"]:
                    span_texts = [span["text"] for span in line["spans"]]
                    all_text.append(" ".join(span_texts))

                # Presume a new block is a new paragraph
                all_text.append("\n")
    finally:
        # Release the file handle even if a page fails to parse.
        document.close()

    return "\n".join(all_text)
339
 
340
  def pdf_to_text(contract_file_cmpt, contract_text_tbox, file_name_tbox):
341
 
342
+ file_text = extract_text_with_spacing(contract_file_cmpt.name)
343
+ #file_text = extract_text(contract_file_cmpt.name)
344
  original_file_name = contract_file_cmpt.name.split("/")[-1]
345
  redline_file_name = original_file_name.split(".")[0]+" Redline.pdf"
346