Spaces:

arithescientist
/

lincolnlegal

Running

Ari committed on Sep 5, 2024

Commit

ec8c26c

verified ·

1 Parent(s): ac28e59

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -13,7 +13,7 @@ nltk.download('punkt')
 tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
 model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
-# Function to convert DOCX to PDF
 def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
     doc = Document(docx_file)
     full_text = []
@@ -23,7 +23,12 @@ def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
     # Create a PDF and write the extracted text
     pdf = FPDF()
     pdf.add_page()
-    pdf.set_font("Times", size=12)
     pdf.multi_cell(190, 10, txt="\n".join(full_text), align='C')
     pdf.output(output_pdf)
     return output_pdf

 tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
 model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
+# Function to convert DOCX to PDF with UTF-8 support
 def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
     doc = Document(docx_file)
     full_text = []
     # Create a PDF and write the extracted text
     pdf = FPDF()
     pdf.add_page()
+    # Set a UTF-8 compatible font (DejaVuSans)
+    pdf.add_font('DejaVu', '', 'DejaVuSans.ttf', uni=True)
+    pdf.set_font("DejaVu", size=12)
+    # Write the content, ensuring UTF-8 encoding is supported
     pdf.multi_cell(190, 10, txt="\n".join(full_text), align='C')
     pdf.output(output_pdf)
     return output_pdf