Ari committed on
Commit
abdc1ac
1 Parent(s): 99a5022

Update app.py

Files changed (1)
  1. app.py +12 -16
app.py CHANGED
@@ -11,22 +11,18 @@ from reportlab.pdfgen import canvas
 tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
 model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
 
-# Function to split text into chunks based on paragraphs
-def split_text_by_paragraph(text, max_chunk_size=1024):
-    paragraphs = text.split("\n\n")  # Splitting by paragraphs
+# Function to split text into chunks based on token length
+def split_text_by_tokens(text, max_length=1024):
+    tokens = tokenizer.encode(text, return_tensors="pt", truncation=False)
+    total_length = tokens.shape[1]
     chunks = []
-    chunk = ""
-
-    for paragraph in paragraphs:
-        if len(chunk) + len(paragraph) <= max_chunk_size:
-            chunk += paragraph + "\n\n"
-        else:
-            chunks.append(chunk.strip())
-            chunk = paragraph + "\n\n"
 
-    if chunk:
-        chunks.append(chunk.strip())
-
+    # Loop through the text, grabbing chunks of tokens
+    for i in range(0, total_length, max_length):
+        chunk_tokens = tokens[:, i:i+max_length]
+        chunk_text = tokenizer.decode(chunk_tokens[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
+        chunks.append(chunk_text)
+
     return chunks
 
 def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
@@ -46,7 +42,7 @@ def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
     pdf.save()
    return output_pdf
 
-# Main processing function with paragraph-based text chunking
+# Main processing function with token-based text chunking
 def pdf_to_text(text, PDF, min_length=80):
     try:
         file_extension = os.path.splitext(PDF.name)[1].lower()
@@ -57,7 +53,7 @@ def pdf_to_text(text, PDF, min_length=80):
         elif file_extension == '.pdf' and text == "":
             text = extract_text(PDF.name)
 
-        chunks = split_text_by_paragraph(text)
+        chunks = split_text_by_tokens(text)
         summarized_text = ""
 
         for chunk in chunks:
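
For context, the sketch below shows how the token-based chunking introduced in this commit fits together end to end. Only split_text_by_tokens mirrors the code added in app.py; the summarize_chunk helper, its generate() settings, and the driver at the bottom are assumptions made for illustration, since the body of the summarization loop is not part of this diff.

```python
# Illustrative sketch of the token-based chunking added in this commit.
# Assumption: summarize_chunk() and its generation parameters are hypothetical;
# the diff only shows that pdf_to_text iterates over the chunks.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

def split_text_by_tokens(text, max_length=1024):
    # Tokenize the whole document once, then slice the token tensor into
    # windows of at most max_length tokens (BART's input limit).
    tokens = tokenizer.encode(text, return_tensors="pt", truncation=False)
    total_length = tokens.shape[1]
    chunks = []
    for i in range(0, total_length, max_length):
        chunk_tokens = tokens[:, i:i + max_length]
        chunks.append(
            tokenizer.decode(chunk_tokens[0], skip_special_tokens=True,
                             clean_up_tokenization_spaces=True)
        )
    return chunks

def summarize_chunk(chunk, min_length=80):
    # Hypothetical per-chunk summarization step, not shown in the diff.
    inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=1024)
    summary_ids = model.generate(inputs["input_ids"], min_length=min_length, max_length=256)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

if __name__ == "__main__":
    long_text = "Lorem ipsum dolor sit amet. " * 2000  # stand-in for extracted PDF text
    chunks = split_text_by_tokens(long_text)
    summarized_text = " ".join(summarize_chunk(c) for c in chunks)
    print(f"{len(chunks)} chunks -> {len(summarized_text)} characters of summary")
```

Compared with the previous paragraph-based splitter, which measured chunk size in characters, slicing the token tensor guarantees that no chunk exceeds BART's 1024-token input window, at the cost of occasionally splitting mid-sentence.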