Spaces: Sleeping

Ari committed
Commit • deb55dd
Parent(s): c0d316e
Update app.py

app.py CHANGED
@@ -9,28 +9,40 @@ from docx import Document
 from reportlab.lib.pagesizes import letter
 from reportlab.pdfgen import canvas

-#
-tokenizer = AutoTokenizer.from_pretrained("
-model = AutoModelForSeq2SeqLM.from_pretrained("
+# Use LegalBERT for handling legal documents
+tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
+model = AutoModelForSeq2SeqLM.from_pretrained("nlpaueb/legal-bert-base-uncased")

-# Function to chunk text
+# Function to chunk the text into manageable pieces
 def chunk_text(text, max_token_len=512):
-
-
-
-
+    sentences = re.split(r'(?<=[.!?]) +', text)
+    chunks = []
+    current_chunk = []
+    current_length = 0
+
+    for sentence in sentences:
+        tokens = tokenizer.tokenize(sentence)
+        if current_length + len(tokens) <= max_token_len:
+            current_chunk.append(sentence)
+            current_length += len(tokens)
+        else:
+            chunks.append(" ".join(current_chunk))
+            current_chunk = [sentence]
+            current_length = len(tokens)
+
+    if current_chunk:
+        chunks.append(" ".join(current_chunk))
+
     return chunks

 def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
     doc = Document(docx_file)
-    full_text = []
-    for para in doc.paragraphs:
-        full_text.append(para.text)
+    full_text = [para.text for para in doc.paragraphs]

     pdf = canvas.Canvas(output_pdf, pagesize=letter)
     pdf.setFont("Helvetica", 12)
-
     text = pdf.beginText(40, 750)
+
     for line in full_text:
         text.textLine(line)

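As a quick illustration of the chunking logic added above, here is a standalone sketch (not part of the commit) with a plain whitespace tokenizer standing in for the LegalBERT tokenizer, so it runs without downloading a model; the sample text and the tiny token budget are made up for the example.

import re

def chunk_text(text, max_token_len=512, tokenize=str.split):
    # Split on sentence boundaries, then greedily pack sentences into chunks
    # whose token count stays within max_token_len.
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks, current_chunk, current_length = [], [], 0
    for sentence in sentences:
        tokens = tokenize(sentence)  # stand-in for tokenizer.tokenize(sentence)
        if current_length + len(tokens) <= max_token_len:
            current_chunk.append(sentence)
            current_length += len(tokens)
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_length = len(tokens)
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

print(chunk_text("First point. Second point. Third point.", max_token_len=4))
# ['First point. Second point.', 'Third point.']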
@@ -38,13 +50,13 @@ def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
     pdf.save()
     return output_pdf

-# Summarize each chunk
+# Summarize each chunk and then recursively summarize the summaries
 def summarize_chunk(chunk, min_length=50, max_length=150):
-    inputs =
-    summary_ids = model.generate(inputs["input_ids"], num_beams=
+    inputs = tokenizer([chunk], max_length=512, truncation=True, return_tensors="pt")
+    summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=min_length, max_length=max_length)
     return tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

-# Main processing function
+# Main processing function using recursive summarization
 def pdf_to_text(text, PDF, min_length=50):
     try:
         file_extension = os.path.splitext(PDF.name)[1].lower()
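summarize_chunk relies on model.generate, which needs an encoder-decoder (seq2seq) checkpoint; nlpaueb/legal-bert-base-uncased is an encoder-only BERT model, so AutoModelForSeq2SeqLM will not load it for generation. A minimal standalone sketch of the same call pattern, assuming a summarization checkpoint such as facebook/bart-large-cnn (an assumption for the example, not what the commit uses):

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Assumed seq2seq checkpoint; the commit itself points at LegalBERT.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

def summarize_chunk(chunk, min_length=50, max_length=150):
    # Tokenize one chunk, truncate to the encoder limit, and beam-search a summary.
    inputs = tokenizer([chunk], max_length=512, truncation=True, return_tensors="pt")
    summary_ids = model.generate(inputs["input_ids"], num_beams=2,
                                 min_length=min_length, max_length=max_length)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True,
                            clean_up_tokenization_spaces=True)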
@@ -55,33 +67,31 @@ def pdf_to_text(text, PDF, min_length=50):
         elif file_extension == '.pdf' and text == "":
             text = extract_text(PDF.name)

-        # Split text into token-based chunks
         chunks = chunk_text(text)
-
-
-        #
-
-
-
+        summarized_chunks = [summarize_chunk(chunk, min_length=min_length) for chunk in chunks]
+
+        # Combine summaries and recursively summarize the combined text
+        summarized_text = " ".join(summarized_chunks)
+        final_summary = summarize_chunk(summarized_text, min_length=min_length, max_length=min_length+150)
+
         # Save summarized text to PDF
         pdf = FPDF()
         pdf.add_page()
         pdf.set_font("Times", size=12)
-        pdf.multi_cell(190, 10, txt=
+        pdf.multi_cell(190, 10, txt=final_summary, align='C')
         pdf_output_path = "legal.pdf"
         pdf.output(pdf_output_path)

         # Convert summarized text to audio
         audio_output_path = "legal.wav"
-        tts = gTTS(text=
+        tts = gTTS(text=final_summary, lang='en', slow=False)
         tts.save(audio_output_path)

-        return audio_output_path,
+        return audio_output_path, final_summary, pdf_output_path

     except Exception as e:
         return None, f"An error occurred: {str(e)}", None

-# Preloaded document processor
 def process_sample_document(min_length=50):
     sample_document_path = "Marbury v. Madison.pdf"

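For reference, the output step in isolation, with a placeholder summary string (hypothetical; in the app, final_summary comes from the recursive summarization above). Note that gTTS always writes MP3-encoded audio, so the "legal.wav" name only reflects the file extension, not the codec.

from fpdf import FPDF
from gtts import gTTS

final_summary = "Placeholder summary text."  # assumed input for the sketch

# Write the summary to a one-page PDF, matching the calls used in the diff.
pdf = FPDF()
pdf.add_page()
pdf.set_font("Times", size=12)
pdf.multi_cell(190, 10, txt=final_summary, align='C')
pdf.output("legal.pdf")

# Synthesize speech for the same text.
tts = gTTS(text=final_summary, lang='en', slow=False)
tts.save("legal.wav")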