Spaces:
Sleeping
Sleeping
Commit
·
507a7c5
1
Parent(s):
42b2650
bug11
Browse files
app.py
CHANGED
@@ -5,6 +5,7 @@ from docx import Document
|
|
5 |
from difflib import unified_diff
|
6 |
import tempfile
|
7 |
from docx.shared import RGBColor
|
|
|
8 |
|
9 |
def pdf_to_text_with_layout(pdf_file):
|
10 |
doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
|
@@ -14,10 +15,15 @@ def pdf_to_text_with_layout(pdf_file):
|
|
14 |
text.append(page.get_text("text"))
|
15 |
return "\n".join(text)
|
16 |
|
|
|
|
|
|
|
|
|
17 |
def text_to_word_with_formatting(text, word_path):
|
18 |
doc = Document()
|
19 |
for line in text.split("\n"):
|
20 |
-
|
|
|
21 |
doc.save(word_path)
|
22 |
|
23 |
def apply_pipeline(file, model_name, balance_type, apsn_transactions, max_fees_per_day, min_overdrawn_fee, min_transaction_overdraft):
|
|
|
5 |
from difflib import unified_diff
|
6 |
import tempfile
|
7 |
from docx.shared import RGBColor
|
8 |
+
import re
|
9 |
|
10 |
def pdf_to_text_with_layout(pdf_file):
|
11 |
doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
|
|
|
15 |
text.append(page.get_text("text"))
|
16 |
return "\n".join(text)
|
17 |
|
18 |
+
def clean_text(text):
    """Return *text* with every character that XML 1.0 forbids removed.

    Kept characters are tab, line feed, carriage return, and the ranges
    U+0020-U+D7FF, U+E000-U+FFFD and U+10000-U+10FFFF. Everything else
    (other control characters, lone surrogates, U+FFFE/U+FFFF) is dropped.
    """
    # ``\x`` consumes exactly two hex digits, so code points above U+00FF
    # must be spelled with ``\uXXXX`` / ``\UXXXXXXXX``. Writing ``\xD7FF`` etc.
    # yields stray literal characters and an invalid reversed range, which
    # makes the pattern fail to compile.
    return re.sub(
        r'[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]',
        '',
        text,
    )
|
21 |
+
|
22 |
def text_to_word_with_formatting(text, word_path):
    """Write *text* into a new Word document saved at *word_path*.

    The text is split on ``"\n"``; each piece is passed through
    ``clean_text`` and added to the document as its own paragraph.
    """
    document = Document()
    # Lazily sanitize each line so cleaning and appending stay interleaved.
    sanitized_lines = (clean_text(raw_line) for raw_line in text.split("\n"))
    for paragraph_text in sanitized_lines:
        document.add_paragraph(paragraph_text)
    document.save(word_path)
|
28 |
|
29 |
def apply_pipeline(file, model_name, balance_type, apsn_transactions, max_fees_per_day, min_overdrawn_fee, min_transaction_overdraft):
|