Ari committed on
Commit
82e6a9a
1 Parent(s): ec8c26c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -11
app.py CHANGED
@@ -6,6 +6,8 @@ from fpdf import FPDF
6
  from gtts import gTTS
7
  from pdfminer.high_level import extract_text
8
  from docx import Document
 
 
9
 
10
  nltk.download('punkt')
11
 
@@ -13,24 +15,24 @@ nltk.download('punkt')
13
  tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
14
  model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
15
 
16
- # Function to convert DOCX to PDF with UTF-8 support
17
  def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
18
  doc = Document(docx_file)
19
  full_text = []
20
  for para in doc.paragraphs:
21
  full_text.append(para.text)
22
 
23
- # Create a PDF and write the extracted text
24
- pdf = FPDF()
25
- pdf.add_page()
26
-
27
- # Set a UTF-8 compatible font (DejaVuSans)
28
- pdf.add_font('DejaVu', '', 'DejaVuSans.ttf', uni=True)
29
- pdf.set_font("DejaVu", size=12)
 
30
 
31
- # Write the content, ensuring UTF-8 encoding is supported
32
- pdf.multi_cell(190, 10, txt="\n".join(full_text), align='C')
33
- pdf.output(output_pdf)
34
  return output_pdf
35
 
36
  # Main processing function
 
6
  from gtts import gTTS
7
  from pdfminer.high_level import extract_text
8
  from docx import Document
9
+ from reportlab.lib.pagesizes import letter
10
+ from reportlab.pdfgen import canvas
11
 
12
  nltk.download('punkt')
13
 
 
15
  tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
16
  model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
17
 
18
+ # Function to convert DOCX to PDF using reportlab (UTF-8 compatible)
19
  def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
20
  doc = Document(docx_file)
21
  full_text = []
22
  for para in doc.paragraphs:
23
  full_text.append(para.text)
24
 
25
+ # Create a PDF and write the extracted text using reportlab
26
+ pdf = canvas.Canvas(output_pdf, pagesize=letter)
27
+ pdf.setFont("Helvetica", 12)
28
+
29
+ # Write text line by line
30
+ text = pdf.beginText(40, 750) # Start position on the page
31
+ for line in full_text:
32
+ text.textLine(line)
33
 
34
+ pdf.drawText(text)
35
+ pdf.save()
 
36
  return output_pdf
37
 
38
  # Main processing function