Ari committed on
Commit
c0d316e
1 Parent(s): f336636

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -38
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import gradio as gr
2
  import os
3
  import re
4
- import numpy as np
5
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
6
  from fpdf import FPDF
7
  from gtts import gTTS
@@ -9,43 +8,44 @@ from pdfminer.high_level import extract_text
9
  from docx import Document
10
  from reportlab.lib.pagesizes import letter
11
  from reportlab.pdfgen import canvas
12
- from concurrent.futures import ThreadPoolExecutor
13
 
14
- tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
15
- model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
 
16
 
17
- # Function to chunk text into sentence-based chunks
18
- def chunk_text(text, max_token_len=1024):
19
- sentences = [sent.strip() + '.' for sent in re.split(r'(?<!\d)\.\s', text) if len(sent) > 1]
20
- token_lengths = [len(tokenizer.tokenize(sent)) for sent in sentences]
21
-
22
- chunk_size = max_token_len
23
- chunks = []
24
- current_chunk = []
25
- current_length = 0
 
 
 
 
26
 
27
- for sent, length in zip(sentences, token_lengths):
28
- if current_length + length <= chunk_size:
29
- current_chunk.append(sent)
30
- current_length += length
31
- else:
32
- chunks.append(" ".join(current_chunk))
33
- current_chunk = [sent]
34
- current_length = length
35
 
36
- if current_chunk:
37
- chunks.append(" ".join(current_chunk))
 
38
 
39
- return chunks
 
 
40
 
41
- # Summarization function
42
- def summarize_chunk(chunk, min_length=80):
43
- inputs = tokenizer([chunk], max_length=1024, truncation=True, return_tensors="pt")
44
- summary_ids = model.generate(inputs["input_ids"], num_beams=1, min_length=min_length, max_length=min_length + 300)
45
- return tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
46
 
47
- # Main processing function using parallel summarization
48
- def pdf_to_text(text, PDF, min_length=80):
49
  try:
50
  file_extension = os.path.splitext(PDF.name)[1].lower()
51
 
@@ -55,13 +55,13 @@ def pdf_to_text(text, PDF, min_length=80):
55
  elif file_extension == '.pdf' and text == "":
56
  text = extract_text(PDF.name)
57
 
 
58
  chunks = chunk_text(text)
59
  summarized_text = ""
60
 
61
- # Parallelize summarization using ThreadPoolExecutor
62
- with ThreadPoolExecutor() as executor:
63
- summaries = list(executor.map(lambda chunk: summarize_chunk(chunk, min_length), chunks))
64
- summarized_text = "\n\n".join(summaries)
65
 
66
  # Save summarized text to PDF
67
  pdf = FPDF()
@@ -81,7 +81,8 @@ def pdf_to_text(text, PDF, min_length=80):
81
  except Exception as e:
82
  return None, f"An error occurred: {str(e)}", None
83
 
84
- def process_sample_document(min_length=80):
 
85
  sample_document_path = "Marbury v. Madison.pdf"
86
 
87
  with open(sample_document_path, "rb") as f:
@@ -90,11 +91,11 @@ def process_sample_document(min_length=80):
90
  # Gradio interface
91
  with gr.Blocks() as iface:
92
  with gr.Row():
93
- process_sample_button = gr.Button("Summarize Pre-Uploaded Marbury v. Madison Case Document")
94
 
95
  text_input = gr.Textbox(label="Input Text")
96
  file_input = gr.File(label="Upload PDF or DOCX")
97
- slider = gr.Slider(minimum=10, maximum=400, step=10, value=80, label="Summary Minimum Length")
98
 
99
  audio_output = gr.Audio(label="Generated Audio")
100
  summary_output = gr.Textbox(label="Generated Summary")
 
1
  import gradio as gr
2
  import os
3
  import re
 
4
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
5
  from fpdf import FPDF
6
  from gtts import gTTS
 
8
  from docx import Document
9
  from reportlab.lib.pagesizes import letter
10
  from reportlab.pdfgen import canvas
 
11
 
12
+ # Switch to a more lightweight model like DistilBART for faster processing
13
+ tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
14
+ model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")
15
 
16
# Function to chunk text based on token length
def chunk_text(text, max_token_len=512):
    """Tokenize *text* and split the ids into chunks of at most ``max_token_len`` tokens.

    Parameters
    ----------
    text : str
        Raw document text to be tokenized with the module-level ``tokenizer``.
    max_token_len : int
        Maximum number of token ids per chunk (the last chunk may be shorter).

    Returns a list of 1-D token-id tensors suitable for ``model.generate``.
    """
    # Fix: use squeeze(0) instead of bare squeeze(). A single-token input
    # produces input_ids of shape (1, 1); squeeze() would collapse that to a
    # 0-d tensor and break len()/slicing below, while squeeze(0) always
    # leaves a 1-D tensor.
    tokens = tokenizer(text, return_tensors="pt", truncation=False, padding=False)["input_ids"].squeeze(0)
    total_length = len(tokens)
    # Split the ids into fixed-size, manageable chunks.
    return [tokens[i:i + max_token_len] for i in range(0, total_length, max_token_len)]
23
+
24
def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
    """Render the paragraphs of a .docx file into a simple single-column PDF.

    Parameters
    ----------
    docx_file : path or file-like object accepted by python-docx ``Document``.
    output_pdf : str
        Destination path for the generated PDF.

    Returns the path of the written PDF file.
    """
    doc = Document(docx_file)
    full_text = [para.text for para in doc.paragraphs]

    pdf = canvas.Canvas(output_pdf, pagesize=letter)
    pdf.setFont("Helvetica", 12)

    text = pdf.beginText(40, 750)
    for line in full_text:
        # Fix: the original wrote every line into one text object starting at
        # y=750, so documents longer than a page drew off the bottom edge.
        # When the cursor nears the bottom margin, flush the text object,
        # start a new page, and re-apply the font (showPage resets state).
        if text.getY() < 40:
            pdf.drawText(text)
            pdf.showPage()
            pdf.setFont("Helvetica", 12)
            text = pdf.beginText(40, 750)
        text.textLine(line)

    pdf.drawText(text)
    pdf.save()
    return output_pdf
40
 
41
# Summarize each chunk of tokens
def summarize_chunk(chunk, min_length=50, max_length=150):
    """Generate and decode a summary for one 1-D tensor of token ids.

    The chunk is promoted to a batch of one, summarized with greedy search
    (``num_beams=1``) bounded by *min_length*/*max_length*, and decoded back
    to plain text with special tokens stripped.
    """
    batch = chunk.unsqueeze(0)  # model.generate expects a leading batch dimension
    generated = model.generate(
        batch,
        num_beams=1,
        min_length=min_length,
        max_length=max_length,
    )
    summary = tokenizer.decode(
        generated[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return summary
46
 
47
+ # Main processing function
48
+ def pdf_to_text(text, PDF, min_length=50):
49
  try:
50
  file_extension = os.path.splitext(PDF.name)[1].lower()
51
 
 
55
  elif file_extension == '.pdf' and text == "":
56
  text = extract_text(PDF.name)
57
 
58
+ # Split text into token-based chunks
59
  chunks = chunk_text(text)
60
  summarized_text = ""
61
 
62
+ # Summarize each chunk
63
+ for chunk in chunks:
64
+ summarized_text += summarize_chunk(chunk, min_length=min_length, max_length=min_length + 100) + "\n\n"
 
65
 
66
  # Save summarized text to PDF
67
  pdf = FPDF()
 
81
  except Exception as e:
82
  return None, f"An error occurred: {str(e)}", None
83
 
84
+ # Preloaded document processor
85
+ def process_sample_document(min_length=50):
86
  sample_document_path = "Marbury v. Madison.pdf"
87
 
88
  with open(sample_document_path, "rb") as f:
 
91
  # Gradio interface
92
  with gr.Blocks() as iface:
93
  with gr.Row():
94
+ process_sample_button = gr.Button("Summarize Marbury v. Madison Case Pre-Uploaded")
95
 
96
  text_input = gr.Textbox(label="Input Text")
97
  file_input = gr.File(label="Upload PDF or DOCX")
98
+ slider = gr.Slider(minimum=10, maximum=300, step=10, value=50, label="Summary Minimum Length")
99
 
100
  audio_output = gr.Audio(label="Generated Audio")
101
  summary_output = gr.Textbox(label="Generated Summary")