import gradio as gr
import os
import nltk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from fpdf import FPDF
from gtts import gTTS
from pdfminer.high_level import extract_text
from docx import Document
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
# Make sure the NLTK punkt sentence tokenizer is available in the local data directory
nltk.data.path.append("/home/user/nltk_data")
nltk.download('punkt', download_dir="/home/user/nltk_data")
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
# Function to split text into chunks that fit within the model's input limit
def split_text(text, max_chunk_size=1024):
    sentences = nltk.sent_tokenize(text)  # Use NLTK's sentence tokenizer
    chunks = []
    chunk = ""
    for sentence in sentences:
        if len(chunk) + len(sentence) <= max_chunk_size:
            chunk += sentence + " "
        else:
            chunks.append(chunk.strip())
            chunk = sentence + " "
    if chunk:
        chunks.append(chunk.strip())
    return chunks
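
# A quick illustration of how split_text behaves (the sample string below is hypothetical
# and not part of the app flow): whole sentences are packed into chunks of at most
# max_chunk_size characters, so each chunk ends on a sentence boundary.
#
#   sample = "First sentence. Second sentence. " * 100
#   for i, c in enumerate(split_text(sample, max_chunk_size=200)):
#       print(i, len(c))   # every chunk stays at or below 200 characters here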
# Convert a DOCX file to a simple PDF so its text can be extracted with pdfminer
def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
    doc = Document(docx_file)
    full_text = []
    for para in doc.paragraphs:
        full_text.append(para.text)
    pdf = canvas.Canvas(output_pdf, pagesize=letter)
    pdf.setFont("Helvetica", 12)
    text = pdf.beginText(40, 750)
    for line in full_text:
        text.textLine(line)
    pdf.drawText(text)
    pdf.save()
    return output_pdf
# Main processing function: extract text, summarize it chunk by chunk, and produce PDF and audio outputs
def pdf_to_text(text, PDF, min_length=80):
    try:
        file_extension = os.path.splitext(PDF.name)[1].lower()
        if file_extension == '.docx':
            pdf_file_path = docx_to_pdf(PDF.name)
            text = extract_text(pdf_file_path)
        elif file_extension == '.pdf' and text == "":
            text = extract_text(PDF.name)
        chunks = split_text(text)
        summarized_text = ""
        for chunk in chunks:
            inputs = tokenizer([chunk], max_length=1024, truncation=True, return_tensors="pt")
            min_length = int(min_length)
            summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=min_length, max_length=min_length + 400)
            output_text = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
            summarized_text += output_text + "\n\n"
        # Write the combined summary to a PDF
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Times", size=12)
        pdf.multi_cell(190, 10, txt=summarized_text, align='C')
        pdf_output_path = "legal.pdf"
        pdf.output(pdf_output_path)
        # gTTS produces MP3 audio, so use an .mp3 extension for the output file
        audio_output_path = "legal.mp3"
        tts = gTTS(text=summarized_text, lang='en', slow=False)
        tts.save(audio_output_path)
        return audio_output_path, summarized_text, pdf_output_path
    except Exception as e:
        return None, f"An error occurred: {str(e)}", None
def process_sample_document(min_length=80):
    sample_document_path = "Marbury v. Madison.pdf"
    with open(sample_document_path, "rb") as f:
        return pdf_to_text("", f, min_length)
with gr.Blocks() as iface:
    with gr.Row():
        process_sample_button = gr.Button("Summarize Marbury v. Madison Case Pre-Uploaded")
    text_input = gr.Textbox(label="Input Text")
    file_input = gr.File(label="Upload PDF or DOCX")
    slider = gr.Slider(minimum=10, maximum=400, step=10, value=80, label="Summary Minimum Length")
    audio_output = gr.Audio(label="Generated Audio")
    summary_output = gr.Textbox(label="Generated Summary")
    pdf_output = gr.File(label="Summary PDF")
    process_sample_button.click(fn=process_sample_document, inputs=slider, outputs=[audio_output, summary_output, pdf_output])
    file_input.change(fn=pdf_to_text, inputs=[text_input, file_input, slider], outputs=[audio_output, summary_output, pdf_output])

if __name__ == "__main__":
    iface.launch()
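
# A minimal sketch of calling the summarizer outside the Gradio UI, assuming a local file
# named "example.pdf" exists (the filename is hypothetical and not part of this Space):
#
#   with open("example.pdf", "rb") as f:
#       audio_path, summary, pdf_path = pdf_to_text("", f, min_length=80)
#       print(summary)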