Ari committed · abdc1ac
Parent(s): 99a5022
Update app.py
app.py
CHANGED
@@ -11,22 +11,18 @@ from reportlab.pdfgen import canvas
 tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
 model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
 
-# Function to split text into chunks based on
-def
-
+# Function to split text into chunks based on token length
+def split_text_by_tokens(text, max_length=1024):
+    tokens = tokenizer.encode(text, return_tensors="pt", truncation=False)
+    total_length = tokens.shape[1]
     chunks = []
-    chunk = ""
-
-    for paragraph in paragraphs:
-        if len(chunk) + len(paragraph) <= max_chunk_size:
-            chunk += paragraph + "\n\n"
-        else:
-            chunks.append(chunk.strip())
-            chunk = paragraph + "\n\n"
 
-
-
-
+    # Loop through the text, grabbing chunks of tokens
+    for i in range(0, total_length, max_length):
+        chunk_tokens = tokens[:, i:i+max_length]
+        chunk_text = tokenizer.decode(chunk_tokens[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
+        chunks.append(chunk_text)
+
     return chunks
 
 def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
@@ -46,7 +42,7 @@ def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
     pdf.save()
     return output_pdf
 
-# Main processing function with
+# Main processing function with token-based text chunking
 def pdf_to_text(text, PDF, min_length=80):
     try:
         file_extension = os.path.splitext(PDF.name)[1].lower()
@@ -57,7 +53,7 @@ def pdf_to_text(text, PDF, min_length=80):
        elif file_extension == '.pdf' and text == "":
            text = extract_text(PDF.name)
 
-       chunks =
+       chunks = split_text_by_tokens(text)
        summarized_text = ""
 
        for chunk in chunks:
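For context on the change: the removed helper grouped paragraphs by character count (max_chunk_size), which gives no guarantee about token counts, while facebook/bart-large-cnn accepts at most 1024 input tokens. The new split_text_by_tokens encodes the document once and slices the token tensor into fixed-size windows, so every chunk is guaranteed to fit the model. Below is a minimal standalone sketch of the same idea, including a driver loop mirroring the pass pdf_to_text makes over the chunks; the summarize helper and its generate() settings (num_beams, max_length) are illustrative assumptions, not code from this Space.

# Standalone sketch of the token-window chunking strategy from this commit,
# plus an illustrative summarization loop. Generation settings below
# (num_beams, max_length) are assumptions for the example, not from app.py.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

def split_text_by_tokens(text, max_length=1024):
    # Encode the whole document once, without truncation, then slice
    # the id tensor into windows of at most max_length tokens.
    tokens = tokenizer.encode(text, return_tensors="pt", truncation=False)
    total_length = tokens.shape[1]
    chunks = []
    for i in range(0, total_length, max_length):
        chunk_tokens = tokens[:, i:i + max_length]
        chunks.append(tokenizer.decode(chunk_tokens[0],
                                       skip_special_tokens=True,
                                       clean_up_tokenization_spaces=True))
    return chunks

def summarize(text, min_length=80):
    # Hypothetical driver mirroring pdf_to_text's loop over the chunks.
    summaries = []
    for chunk in split_text_by_tokens(text):
        inputs = tokenizer(chunk, return_tensors="pt",
                           truncation=True, max_length=1024)
        summary_ids = model.generate(inputs["input_ids"],
                                     num_beams=4,
                                     min_length=min_length,
                                     max_length=256)
        summaries.append(tokenizer.decode(summary_ids[0],
                                          skip_special_tokens=True))
    return "\n\n".join(summaries)

One trade-off visible in the diff: token windows can split mid-sentence, whereas the removed paragraph-based helper kept paragraphs intact; the commit accepts that in exchange for a hard guarantee that no chunk exceeds the model's input limit.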