Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -7,8 +7,9 @@ import json
|
|
7 |
from jinja2 import Template
|
8 |
import requests
|
9 |
from pdfminer.high_level import extract_text
|
10 |
-
|
11 |
import pdfkit
|
|
|
12 |
# Initialize OpenAI
|
13 |
openai.api_key = os.environ.get('OPENAI_API_KEY')
|
14 |
|
@@ -318,10 +319,28 @@ def login_auth(username, password):
|
|
318 |
|
319 |
return False
|
320 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
321 |
|
322 |
def pdf_to_text(contract_file_cmpt, contract_text_tbox, file_name_tbox):
|
323 |
|
324 |
-
file_text =
|
|
|
325 |
original_file_name = contract_file_cmpt.name.split("/")[-1]
|
326 |
redline_file_name = original_file_name.split(".")[0]+" Redline.pdf"
|
327 |
|
|
|
7 |
from jinja2 import Template
|
8 |
import requests
|
9 |
from pdfminer.high_level import extract_text
|
10 |
+
import fitz
|
11 |
import pdfkit
|
12 |
+
|
13 |
# Initialize OpenAI
|
14 |
openai.api_key = os.environ.get('OPENAI_API_KEY')
|
15 |
|
|
|
319 |
|
320 |
return False
|
321 |
|
322 |
+
def extract_text_with_spacing(pdf_path: str) -> str:
    """Extract the text of a PDF, keeping spaces between spans.

    Every page is read through PyMuPDF's ``"dict"`` text output. The spans
    of each line are joined with a single space, each line becomes one
    entry, and a ``"\n"`` entry is added after every block that has lines.
    All entries are finally joined with ``"\n"``, so consecutive lines are
    separated by one newline and consecutive blocks by extra blank lines.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text of the whole document as a single string
        (an empty string when no text lines are found).
    """
    all_text = []

    # Use the document as a context manager so the file handle is released
    # even when a page fails to parse; the previous explicit close() at the
    # end of the function was skipped whenever an exception was raised.
    with fitz.open(pdf_path) as document:
        for page in document:
            # Extract text in a dict structure
            blocks = page.get_text("dict")["blocks"]

            for b in blocks:
                # Skip blocks that do not contain lines of text
                if "lines" not in b:
                    continue

                for line in b["lines"]:
                    # Spans carry no separator of their own; re-insert a space
                    all_text.append(" ".join(span["text"] for span in line["spans"]))

                # Presume a new block is a new paragraph.
                # NOTE(review): nesting was reconstructed from a flattened
                # diff view; the separator is assumed to be appended once per
                # text block -- confirm against the original file.
                all_text.append("\n")

    return "\n".join(all_text)
|
339 |
|
340 |
def pdf_to_text(contract_file_cmpt, contract_text_tbox, file_name_tbox):
|
341 |
|
342 |
+
file_text = extract_text_with_spacing(contract_file_cmpt.name)
|
343 |
+
#file_text = extract_text(contract_file_cmpt.name)
|
344 |
original_file_name = contract_file_cmpt.name.split("/")[-1]
|
345 |
redline_file_name = original_file_name.split(".")[0]+" Redline.pdf"
|
346 |
|