Ari committed · Commit f336636
Parent(s): 204d8e4
Update app.py
app.py CHANGED
@@ -9,18 +9,16 @@ from pdfminer.high_level import extract_text
 from docx import Document
 from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas
+from concurrent.futures import ThreadPoolExecutor

-# Load the models and tokenizer
 tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
 model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

 # Function to chunk text into sentence-based chunks
 def chunk_text(text, max_token_len=1024):
-    # Split text into sentences
     sentences = [sent.strip() + '.' for sent in re.split(r'(?<!\d)\.\s', text) if len(sent) > 1]
     token_lengths = [len(tokenizer.tokenize(sent)) for sent in sentences]

-    # Initialize chunking
     chunk_size = max_token_len
     chunks = []
     current_chunk = []
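Note: the packing loop in the middle of chunk_text (old lines 27-39) falls between these two hunks and is not shown in the diff. A minimal sketch of a greedy packing loop consistent with the variables above, offered as an assumption rather than the committed code:

current_len = 0
for sent, sent_len in zip(sentences, token_lengths):
    # Start a new chunk once adding the next sentence would exceed the token budget
    if current_chunk and current_len + sent_len > chunk_size:
        chunks.append(' '.join(current_chunk))
        current_chunk, current_len = [], 0
    current_chunk.append(sent)
    current_len += sent_len
if current_chunk:
    chunks.append(' '.join(current_chunk))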
@@ -40,46 +38,30 @@ def chunk_text(text, max_token_len=1024):

     return chunks

-
-
-
-
-
-
-    pdf = canvas.Canvas(output_pdf, pagesize=letter)
-    pdf.setFont("Helvetica", 12)
-
-    text = pdf.beginText(40, 750)
-    for line in full_text:
-        text.textLine(line)
-
-    pdf.drawText(text)
-    pdf.save()
-    return output_pdf
+# Summarization function
+def summarize_chunk(chunk, min_length=80):
+    inputs = tokenizer([chunk], max_length=1024, truncation=True, return_tensors="pt")
+    summary_ids = model.generate(inputs["input_ids"], num_beams=1, min_length=min_length, max_length=min_length + 300)
+    return tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]

-# Main processing function using
+# Main processing function using parallel summarization
 def pdf_to_text(text, PDF, min_length=80):
     try:
         file_extension = os.path.splitext(PDF.name)[1].lower()

-        # If DOCX, convert to PDF
         if file_extension == '.docx':
             pdf_file_path = docx_to_pdf(PDF.name)
             text = extract_text(pdf_file_path)
-        # If PDF, extract text
         elif file_extension == '.pdf' and text == "":
             text = extract_text(PDF.name)

-        # Split text into chunks based on sentence boundaries
         chunks = chunk_text(text)
         summarized_text = ""

-
-
-
-
-            output_text = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
-            summarized_text += output_text + "\n\n"
+        # Parallelize summarization using ThreadPoolExecutor
+        with ThreadPoolExecutor() as executor:
+            summaries = list(executor.map(lambda chunk: summarize_chunk(chunk, min_length), chunks))
+        summarized_text = "\n\n".join(summaries)

         # Save summarized text to PDF
         pdf = FPDF()
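Note: summarize_chunk runs one chunk per thread. This can overlap work because PyTorch releases the GIL inside its C++ kernels, though all threads still share the single BART instance and the same CPU cores. A hedged alternative sketch (not in the commit) that avoids threads by padding all chunks into one batched generate call, using only the tokenizer and model already defined in app.py:

inputs = tokenizer(chunks, max_length=1024, truncation=True, padding=True, return_tensors="pt")
summary_ids = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],  # mask out padding tokens
    num_beams=1,
    min_length=min_length,
    max_length=min_length + 300,
)
summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
summarized_text = "\n\n".join(summaries)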
@@ -99,7 +81,6 @@ def pdf_to_text(text, PDF, min_length=80):
     except Exception as e:
         return None, f"An error occurred: {str(e)}", None

-# Preloaded document processor
 def process_sample_document(min_length=80):
     sample_document_path = "Marbury v. Madison.pdf"

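Note: the body of process_sample_document is elided by this hunk. A hypothetical sketch of how it could delegate to pdf_to_text (the open() call and return shape are assumptions; pdf_to_text only reads the .name attribute of its file argument):

def process_sample_document(min_length=80):
    sample_document_path = "Marbury v. Madison.pdf"
    # Assumption: open() objects expose .name, which is all pdf_to_text uses
    with open(sample_document_path, "rb") as sample_file:
        return pdf_to_text("", sample_file, min_length)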
@@ -109,7 +90,7 @@ def process_sample_document(min_length=80):
 # Gradio interface
 with gr.Blocks() as iface:
     with gr.Row():
-        process_sample_button = gr.Button("Summarize Marbury v. Madison Case
+        process_sample_button = gr.Button("Summarize Pre-Uploaded Marbury v. Madison Case Document")

     text_input = gr.Textbox(label="Input Text")
     file_input = gr.File(label="Upload PDF or DOCX")
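Note: the click handler for process_sample_button is outside this diff. A hypothetical wiring, assuming output components matching the three values pdf_to_text returns in its error path (all names below are placeholders):

preview_out = gr.Image(label="Preview")      # placeholder component
summary_out = gr.Textbox(label="Summary")    # placeholder component
pdf_out = gr.File(label="Summary PDF")       # placeholder component
process_sample_button.click(fn=process_sample_document, inputs=[], outputs=[preview_out, summary_out, pdf_out])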