Spaces:
Sleeping
Sleeping
Ari
committed on
Commit
•
82e6a9a
1
Parent(s):
ec8c26c
Update app.py
Browse files
app.py
CHANGED
@@ -6,6 +6,8 @@ from fpdf import FPDF
|
|
6 |
from gtts import gTTS
|
7 |
from pdfminer.high_level import extract_text
|
8 |
from docx import Document
|
|
|
|
|
9 |
|
10 |
nltk.download('punkt')
|
11 |
|
@@ -13,24 +15,24 @@ nltk.download('punkt')
|
|
13 |
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
|
14 |
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
|
15 |
|
16 |
-
# Function to convert DOCX to PDF
|
17 |
def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
|
18 |
doc = Document(docx_file)
|
19 |
full_text = []
|
20 |
for para in doc.paragraphs:
|
21 |
full_text.append(para.text)
|
22 |
|
23 |
-
# Create a PDF and write the extracted text
|
24 |
-
pdf =
|
25 |
-
pdf.
|
26 |
-
|
27 |
-
#
|
28 |
-
pdf.
|
29 |
-
|
|
|
30 |
|
31 |
-
|
32 |
-
pdf.
|
33 |
-
pdf.output(output_pdf)
|
34 |
return output_pdf
|
35 |
|
36 |
# Main processing function
|
|
|
6 |
from gtts import gTTS
|
7 |
from pdfminer.high_level import extract_text
|
8 |
from docx import Document
|
9 |
+
from reportlab.lib.pagesizes import letter
|
10 |
+
from reportlab.pdfgen import canvas
|
11 |
|
12 |
nltk.download('punkt')
|
13 |
|
|
|
15 |
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
|
16 |
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
|
17 |
|
18 |
+
# Function to convert DOCX to PDF using reportlab (UTF-8 compatible)
|
19 |
def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
|
20 |
doc = Document(docx_file)
|
21 |
full_text = []
|
22 |
for para in doc.paragraphs:
|
23 |
full_text.append(para.text)
|
24 |
|
25 |
+
# Create a PDF and write the extracted text using reportlab
|
26 |
+
pdf = canvas.Canvas(output_pdf, pagesize=letter)
|
27 |
+
pdf.setFont("Helvetica", 12)
|
28 |
+
|
29 |
+
# Write text line by line
|
30 |
+
text = pdf.beginText(40, 750) # Start position on the page
|
31 |
+
for line in full_text:
|
32 |
+
text.textLine(line)
|
33 |
|
34 |
+
pdf.drawText(text)
|
35 |
+
pdf.save()
|
|
|
36 |
return output_pdf
|
37 |
|
38 |
# Main processing function
|