Spaces: Sleeping

Ari committed
Commit • deb55dd
Parent(s): c0d316e
Update app.py

app.py CHANGED
@@ -9,28 +9,40 @@ from docx import Document
 from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas

-#
-tokenizer = AutoTokenizer.from_pretrained("
-model = AutoModelForSeq2SeqLM.from_pretrained("
+# Use LegalBERT for handling legal documents
+tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
+model = AutoModelForSeq2SeqLM.from_pretrained("nlpaueb/legal-bert-base-uncased")

-# Function to chunk text
+# Function to chunk the text into manageable pieces
 def chunk_text(text, max_token_len=512):
-
-
-
-
+    sentences = re.split(r'(?<=[.!?]) +', text)
+    chunks = []
+    current_chunk = []
+    current_length = 0
+
+    for sentence in sentences:
+        tokens = tokenizer.tokenize(sentence)
+        if current_length + len(tokens) <= max_token_len:
+            current_chunk.append(sentence)
+            current_length += len(tokens)
+        else:
+            chunks.append(" ".join(current_chunk))
+            current_chunk = [sentence]
+            current_length = len(tokens)
+
+    if current_chunk:
+        chunks.append(" ".join(current_chunk))
+
     return chunks

 def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
     doc = Document(docx_file)
-    full_text = []
-    for para in doc.paragraphs:
-        full_text.append(para.text)
+    full_text = [para.text for para in doc.paragraphs]

     pdf = canvas.Canvas(output_pdf, pagesize=letter)
     pdf.setFont("Helvetica", 12)
-
     text = pdf.beginText(40, 750)
+
     for line in full_text:
         text.textLine(line)

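As a quick illustration of the chunking logic added above, here is a standalone sketch (not part of the commit) with a plain whitespace tokenizer standing in for the LegalBERT tokenizer, so it runs without downloading a model; the sample text and the tiny token budget are made up for the example.

import re

def chunk_text(text, max_token_len=512, tokenize=str.split):
    # Split on sentence boundaries, then greedily pack sentences into chunks
    # whose token count stays within max_token_len.
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks, current_chunk, current_length = [], [], 0
    for sentence in sentences:
        tokens = tokenize(sentence)  # stand-in for tokenizer.tokenize(sentence)
        if current_length + len(tokens) <= max_token_len:
            current_chunk.append(sentence)
            current_length += len(tokens)
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_length = len(tokens)
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

print(chunk_text("First point. Second point. Third point.", max_token_len=4))
# ['First point. Second point.', 'Third point.']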
@@ -38,13 +50,13 @@ def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
     pdf.save()
     return output_pdf

-# Summarize each chunk
+# Summarize each chunk and then recursively summarize the summaries
 def summarize_chunk(chunk, min_length=50, max_length=150):
-    inputs =
-    summary_ids = model.generate(inputs["input_ids"], num_beams=
+    inputs = tokenizer([chunk], max_length=512, truncation=True, return_tensors="pt")
+    summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=min_length, max_length=max_length)
     return tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

-# Main processing function
+# Main processing function using recursive summarization
 def pdf_to_text(text, PDF, min_length=50):
     try:
         file_extension = os.path.splitext(PDF.name)[1].lower()
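summarize_chunk relies on model.generate, which needs an encoder-decoder (seq2seq) checkpoint; nlpaueb/legal-bert-base-uncased is an encoder-only BERT model, so AutoModelForSeq2SeqLM will not load it for generation. A minimal standalone sketch of the same call pattern, assuming a summarization checkpoint such as facebook/bart-large-cnn (an assumption for the example, not what the commit uses):

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Assumed seq2seq checkpoint; the commit itself points at LegalBERT.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

def summarize_chunk(chunk, min_length=50, max_length=150):
    # Tokenize one chunk, truncate to the encoder limit, and beam-search a summary.
    inputs = tokenizer([chunk], max_length=512, truncation=True, return_tensors="pt")
    summary_ids = model.generate(inputs["input_ids"], num_beams=2,
                                 min_length=min_length, max_length=max_length)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True,
                            clean_up_tokenization_spaces=True)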
@@ -55,33 +67,31 @@ def pdf_to_text(text, PDF, min_length=50):
         elif file_extension == '.pdf' and text == "":
             text = extract_text(PDF.name)

-        # Split text into token-based chunks
         chunks = chunk_text(text)
-
-
-        #
-
-
-
+        summarized_chunks = [summarize_chunk(chunk, min_length=min_length) for chunk in chunks]
+
+        # Combine summaries and recursively summarize the combined text
+        summarized_text = " ".join(summarized_chunks)
+        final_summary = summarize_chunk(summarized_text, min_length=min_length, max_length=min_length+150)
+
         # Save summarized text to PDF
         pdf = FPDF()
         pdf.add_page()
         pdf.set_font("Times", size=12)
-        pdf.multi_cell(190, 10, txt=
+        pdf.multi_cell(190, 10, txt=final_summary, align='C')
         pdf_output_path = "legal.pdf"
         pdf.output(pdf_output_path)

         # Convert summarized text to audio
         audio_output_path = "legal.wav"
-        tts = gTTS(text=
+        tts = gTTS(text=final_summary, lang='en', slow=False)
         tts.save(audio_output_path)

-        return audio_output_path,
+        return audio_output_path, final_summary, pdf_output_path

     except Exception as e:
         return None, f"An error occurred: {str(e)}", None

-# Preloaded document processor
 def process_sample_document(min_length=50):
     sample_document_path = "Marbury v. Madison.pdf"

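For reference, the output step in isolation, with a placeholder summary string (hypothetical; in the app, final_summary comes from the recursive summarization above). Note that gTTS always writes MP3-encoded audio, so the "legal.wav" name only reflects the file extension, not the codec.

from fpdf import FPDF
from gtts import gTTS

final_summary = "Placeholder summary text."  # assumed input for the sketch

# Write the summary to a one-page PDF, matching the calls used in the diff.
pdf = FPDF()
pdf.add_page()
pdf.set_font("Times", size=12)
pdf.multi_cell(190, 10, txt=final_summary, align='C')
pdf.output("legal.pdf")

# Synthesize speech for the same text.
tts = gTTS(text=final_summary, lang='en', slow=False)
tts.save("legal.wav")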