anmolsahai committed on
Commit
507a7c5
·
1 Parent(s): 42b2650
Files changed (1) hide show
  1. app.py +7 -1
app.py CHANGED
@@ -5,6 +5,7 @@ from docx import Document
5
  from difflib import unified_diff
6
  import tempfile
7
  from docx.shared import RGBColor
 
8
 
9
  def pdf_to_text_with_layout(pdf_file):
10
  doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
@@ -14,10 +15,15 @@ def pdf_to_text_with_layout(pdf_file):
14
  text.append(page.get_text("text"))
15
  return "\n".join(text)
16
 
 
 
 
 
17
  def text_to_word_with_formatting(text, word_path):
18
  doc = Document()
19
  for line in text.split("\n"):
20
- doc.add_paragraph(line)
 
21
  doc.save(word_path)
22
 
23
  def apply_pipeline(file, model_name, balance_type, apsn_transactions, max_fees_per_day, min_overdrawn_fee, min_transaction_overdraft):
 
5
  from difflib import unified_diff
6
  import tempfile
7
  from docx.shared import RGBColor
8
+ import re
9
 
10
  def pdf_to_text_with_layout(pdf_file):
11
  doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
 
15
  text.append(page.get_text("text"))
16
  return "\n".join(text)
17
 
18
# Matches every code point that is NOT a legal XML 1.0 character, i.e. anything
# outside: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF].
# The \u / \U escapes are required here: "\x" only consumes two hex digits, so
# a spelling like "\xD7FF" is parsed as "\xD7" followed by the literals "FF"
# and, with the higher ranges, produces an invalid character range.
_XML_INVALID_CHARS_RE = re.compile(
    r'[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]'
)


def clean_text(text: str) -> str:
    """Return *text* with all non-XML-compatible characters removed.

    Characters allowed by the XML 1.0 ``Char`` production (tab, line feed,
    carriage return, and the printable/Unicode ranges listed above) are kept
    unchanged; everything else (e.g. NUL and other control characters, lone
    surrogates, U+FFFE/U+FFFF) is dropped.

    Args:
        text: The string to sanitize.

    Returns:
        The sanitized string; an empty string stays empty.
    """
    return _XML_INVALID_CHARS_RE.sub('', text)
21
+
22
def text_to_word_with_formatting(text, word_path):
    """Write *text* into a new Word document, one paragraph per line.

    The input is split on newline characters; each piece is passed through
    ``clean_text`` before being appended as its own paragraph. The finished
    document is saved to *word_path*.
    """
    document = Document()
    # Sanitize each line before it reaches the document.
    for sanitized in map(clean_text, text.split("\n")):
        document.add_paragraph(sanitized)
    document.save(word_path)
28
 
29
  def apply_pipeline(file, model_name, balance_type, apsn_transactions, max_fees_per_day, min_overdrawn_fee, min_transaction_overdraft):