# lincolnlegal / app.py
# Ari
# Update app.py
# 82e6a9a verified
# raw
# history blame
# 3.19 kB
import gradio as gr
import os
import nltk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from fpdf import FPDF
from gtts import gTTS
from pdfminer.high_level import extract_text
from docx import Document
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
# Fetch the NLTK 'punkt' resource when the module is imported.
nltk.download('punkt')

# Instantiate the tokenizer and the seq2seq model a single time at import,
# so individual requests reuse them instead of reloading the checkpoint.
MODEL_NAME = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
# Convert the paragraph text of a DOCX file into a plain, paginated PDF (reportlab)
def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
    """Write the paragraph text of a DOCX file into a text-only PDF.

    Only ``paragraph.text`` is copied; formatting, tables and images are not
    carried over. Lines wider than the printable area are wrapped, and a new
    page is started whenever the text cursor drops below the bottom margin,
    so long documents no longer run off the first page.

    NOTE(review): the built-in Helvetica font is used; characters outside its
    encoding may not render correctly — confirm if non-Latin input matters.

    Args:
        docx_file: Path (or file-like object) accepted by ``docx.Document``.
        output_pdf: Destination path of the generated PDF.

    Returns:
        The ``output_pdf`` path that was written.
    """
    # Local import keeps this block self-contained; reportlab is already a
    # dependency of this file.
    from reportlab.lib.utils import simpleSplit

    font_name, font_size = "Helvetica", 12
    left_margin, top_y, bottom_margin = 40, 750, 40
    max_width = letter[0] - 2 * left_margin

    doc = Document(docx_file)

    pdf = canvas.Canvas(output_pdf, pagesize=letter)
    pdf.setFont(font_name, font_size)
    text = pdf.beginText(left_margin, top_y)  # Start position on the page

    for para in doc.paragraphs:
        # A paragraph may contain explicit line breaks; an empty paragraph
        # still produces one blank line, as before.
        for raw_line in para.text.splitlines() or [""]:
            if pdf.stringWidth(raw_line, font_name, font_size) <= max_width:
                # Fits as-is: keep the line untouched (including spacing).
                pieces = [raw_line]
            else:
                pieces = simpleSplit(raw_line, font_name, font_size, max_width) or [""]

            for piece in pieces:
                if text.getY() < bottom_margin:
                    # Page is full: flush it and continue on a fresh page.
                    pdf.drawText(text)
                    pdf.showPage()
                    # Graphics state (including the font) is reset per page.
                    pdf.setFont(font_name, font_size)
                    text = pdf.beginText(left_margin, top_y)
                text.textLine(piece)

    pdf.drawText(text)
    pdf.save()
    return output_pdf
# Main processing function
def pdf_to_text(text, PDF, min_length=20):
    """Summarize typed text or an uploaded PDF/DOCX, then export PDF and audio.

    Input selection:
      * ``.docx`` upload: converted with ``docx_to_pdf`` and its extracted
        text replaces ``text``.
      * ``.pdf`` upload: its extracted text is used only when ``text`` is blank.
      * no upload (``PDF is None``): ``text`` is summarized as typed.

    Args:
        text: Text typed by the user; may be empty or ``None``.
        PDF: Uploaded file, or ``None``. Either an object exposing ``.name``
            or a plain path string is accepted.
        min_length: Minimum summary length passed to ``model.generate``.

    Returns:
        ``(audio_path, summary_text, pdf_path)`` on success, or
        ``(None, "An error occurred: ...", None)`` if anything fails.
    """
    try:
        text = text or ""

        # The upload is optional: only inspect it when one was provided.
        if PDF is not None:
            # Presumably the upload arrives as an object with `.name` or as a
            # plain path depending on the Gradio version — handle both.
            file_path = getattr(PDF, "name", PDF)
            file_extension = os.path.splitext(file_path)[1].lower()

            # If DOCX, first convert it to PDF, then extract text from that PDF
            if file_extension == '.docx':
                pdf_file_path = docx_to_pdf(file_path)
                text = extract_text(pdf_file_path)
            # If PDF, extract text from it directly (typed text takes priority)
            elif file_extension == '.pdf' and not text.strip():
                text = extract_text(file_path)

        # Fail with a clear message instead of summarizing nothing.
        if not text.strip():
            raise ValueError("No input provided: enter text or upload a PDF/DOCX file.")

        # Tokenize text; truncate explicitly to the 1024-token input limit
        inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors="pt")
        min_length = int(min_length)

        # Generate summary
        summary_ids = model.generate(
            inputs["input_ids"],
            num_beams=2,
            min_length=min_length,
            max_length=min_length + 1000,
        )
        output_text = tokenizer.batch_decode(
            summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )[0]

        # Save summarized text to PDF. FPDF's core "Times" font only covers
        # Latin-1, so unsupported characters (e.g. curly quotes) are replaced
        # with "?" rather than aborting the whole request.
        printable_text = output_text.encode("latin-1", "replace").decode("latin-1")
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Times", size=12)
        pdf.multi_cell(190, 10, txt=printable_text, align='C')
        pdf_output_path = "legal.pdf"
        pdf.output(pdf_output_path)

        # Convert summarized text to audio. gTTS writes MP3 data, so the file
        # extension matches the actual format.
        audio_output_path = "legal.mp3"
        tts = gTTS(text=output_text, lang='en', slow=False)
        tts.save(audio_output_path)

        return audio_output_path, output_text, pdf_output_path
    except Exception as e:
        # UI boundary: surface the failure in the summary textbox.
        return None, f"An error occurred: {str(e)}", None
# Gradio interface
# Input components, in the positional order of pdf_to_text(text, PDF, min_length).
input_components = [
    gr.Textbox(label="Input Text"),
    gr.File(label="Upload PDF or DOCX"),
    gr.Slider(minimum=10, maximum=100, step=10, value=20, label="Summary Minimum Length"),
]

# Output components, in the order of the tuple returned by pdf_to_text:
# (audio path, summary text, summary PDF path).
output_components = [
    gr.Audio(label="Generated Audio"),
    gr.Textbox(label="Generated Summary"),
    gr.File(label="Summary PDF"),
]

iface = gr.Interface(fn=pdf_to_text, inputs=input_components, outputs=output_components)

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()