Ari committed on
Commit 9d0e6a8
1 Parent(s): c8bcda4

Update app.py

Files changed (1)
  1. app.py +34 -12
app.py CHANGED
@@ -14,6 +14,24 @@ nltk.download('punkt')
 tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
 model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
 
+# Function to split text into chunks
+def split_text(text, max_chunk_size=1024):
+    sentences = nltk.sent_tokenize(text)
+    chunks = []
+    chunk = ""
+
+    for sentence in sentences:
+        if len(chunk) + len(sentence) <= max_chunk_size:
+            chunk += sentence + " "
+        else:
+            chunks.append(chunk.strip())
+            chunk = sentence + " "
+
+    if chunk:
+        chunks.append(chunk.strip())
+
+    return chunks
+
 def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
     doc = Document(docx_file)
     full_text = []
@@ -31,7 +49,8 @@ def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
     pdf.save()
     return output_pdf
 
-def pdf_to_text(text, PDF, min_length=80): # Increase default min_length by 4 times
+# Main processing function with text chunking
+def pdf_to_text(text, PDF, min_length=80):
     try:
         file_extension = os.path.splitext(PDF.name)[1].lower()
 
@@ -41,25 +60,28 @@ def pdf_to_text(text, PDF, min_length=80): # Increase default min_length by 4 t
         elif file_extension == '.pdf' and text == "":
             text = extract_text(PDF.name)
 
-        inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors="pt")
-        min_length = int(min_length)
-
-        # Explicitly setting clean_up_tokenization_spaces=True to match the future default behavior
-        summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=min_length, max_length=min_length + 4000)
-        output_text = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
-
+        chunks = split_text(text)
+        summarized_text = ""
+
+        for chunk in chunks:
+            inputs = tokenizer([chunk], max_length=1024, truncation=True, return_tensors="pt")
+            min_length = int(min_length)
+            summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=min_length, max_length=min_length + 400)
+            output_text = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
+            summarized_text += output_text + "\n\n"
+
         pdf = FPDF()
         pdf.add_page()
         pdf.set_font("Times", size=12)
-        pdf.multi_cell(190, 10, txt=output_text, align='C')
+        pdf.multi_cell(190, 10, txt=summarized_text, align='C')
         pdf_output_path = "legal.pdf"
         pdf.output(pdf_output_path)
 
         audio_output_path = "legal.wav"
-        tts = gTTS(text=output_text, lang='en', slow=False)
+        tts = gTTS(text=summarized_text, lang='en', slow=False)
         tts.save(audio_output_path)
 
-        return audio_output_path, output_text, pdf_output_path
+        return audio_output_path, summarized_text, pdf_output_path
 
     except Exception as e:
         return None, f"An error occurred: {str(e)}", None
@@ -76,7 +98,7 @@ with gr.Blocks() as iface:
 
     text_input = gr.Textbox(label="Input Text")
     file_input = gr.File(label="Upload PDF or DOCX")
-    slider = gr.Slider(minimum=10, maximum=400, step=10, value=80, label="Summary Minimum Length") # Default value set to 80
+    slider = gr.Slider(minimum=10, maximum=400, step=10, value=80, label="Summary Minimum Length")
 
     audio_output = gr.Audio(label="Generated Audio")
     summary_output = gr.Textbox(label="Generated Summary")
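
For context, a small standalone sketch of how the sentence-based chunking introduced in this commit behaves. The helper body mirrors the split_text added above; the sample string and the 200-character limit are illustrative assumptions, not values from the repository.

# Illustrative sketch (not part of the commit): pack whole sentences into
# chunks of at most max_chunk_size characters, as the new split_text helper does.
import nltk

nltk.download('punkt')

def split_text(text, max_chunk_size=1024):
    sentences = nltk.sent_tokenize(text)
    chunks, chunk = [], ""
    for sentence in sentences:
        if len(chunk) + len(sentence) <= max_chunk_size:
            chunk += sentence + " "
        else:
            chunks.append(chunk.strip())
            chunk = sentence + " "
    if chunk:
        chunks.append(chunk.strip())
    return chunks

sample = "Each clause is summarized on its own. " * 40   # hypothetical long input
chunks = split_text(sample, max_chunk_size=200)
print(len(chunks), max(len(c) for c in chunks))          # several chunks, each within the limit

Because every chunk fits well within one BART input window, the per-chunk generate() calls in the updated pdf_to_text avoid the silent truncation that a single oversized input hit in the previous version.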