Spaces:
Running
Running
File size: 3,192 Bytes
17e34a5 94bf427 3813c2d 94bf427 ac28e59 82e6a9a cffceba 94bf427 11ef927 94bf427 11ef927 82e6a9a ac28e59 82e6a9a ec8c26c 82e6a9a ac28e59 94bf427 7f2b3e5 ac28e59 94bf427 ac28e59 170c2bc 94bf427 170c2bc 94bf427 170c2bc 94bf427 170c2bc 7f2b3e5 94bf427 170c2bc 94bf427 7f2b3e5 94bf427 7f2b3e5 94bf427 ac28e59 94bf427 7f2b3e5 0751294 342a4a2 ac28e59 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
import gradio as gr
import os
import nltk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from fpdf import FPDF
from gtts import gTTS
from pdfminer.high_level import extract_text
from docx import Document
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
# One-time module setup: download the NLTK "punkt" resource, then load the
# summarization tokenizer and model once so every request reuses them.
nltk.download("punkt")
_SUMMARIZER_CHECKPOINT = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(_SUMMARIZER_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_SUMMARIZER_CHECKPOINT)
# Convert a DOCX file to a plain-text PDF using reportlab.
def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
    """Write the paragraph text of a DOCX file into a paginated PDF.

    Each paragraph is wrapped to the printable width and a new page is
    started whenever the text cursor drops below the bottom margin, so no
    text is drawn outside the page.

    Args:
        docx_file: Path (or file-like object) accepted by ``Document``.
        output_pdf: Path of the PDF to create; overwritten if it exists.

    Returns:
        The ``output_pdf`` path that was written.

    NOTE(review): text is drawn in the built-in Helvetica font; characters
    that font does not cover may not render -- confirm whether non-Latin
    documents need an embedded TrueType font.
    """
    # Function-scope import keeps this block independent of file-level imports.
    from reportlab.lib.utils import simpleSplit

    font_name, font_size = "Helvetica", 12
    left_margin, bottom_margin = 40, 40
    first_baseline = 750  # Start position on the page (same as before)
    max_line_width = letter[0] - 2 * left_margin

    doc = Document(docx_file)
    pdf = canvas.Canvas(output_pdf, pagesize=letter)

    def start_page_text():
        # showPage() resets the graphics state, so set the font on every page.
        pdf.setFont(font_name, font_size)
        return pdf.beginText(left_margin, first_baseline)

    text = start_page_text()
    for para in doc.paragraphs:
        # simpleSplit yields no lines for an empty paragraph; keep it as a
        # blank line so paragraph spacing is preserved. Runs of whitespace
        # inside a paragraph are collapsed by the wrapping.
        wrapped_lines = simpleSplit(
            para.text, font_name, font_size, max_line_width
        ) or [""]
        for line in wrapped_lines:
            if text.getY() < bottom_margin:
                # Page is full: flush what we have and continue on a new page.
                pdf.drawText(text)
                pdf.showPage()
                text = start_page_text()
            text.textLine(line)

    pdf.drawText(text)
    pdf.save()
    return output_pdf
# Main processing function
def pdf_to_text(text, PDF, min_length=20):
    """Summarize typed text or an uploaded PDF/DOCX; return audio, text and PDF.

    Input selection:
      * a ``.docx`` upload always replaces ``text``;
      * a ``.pdf`` upload is read only when ``text`` is empty/whitespace;
      * with no upload, ``text`` is summarized as typed.

    Args:
        text: Text typed by the user; may be empty or ``None``.
        PDF: The upload, or ``None``. Either a path string or an object
            whose ``name`` attribute is the file path.
        min_length: Minimum summary length passed to ``model.generate``;
            coerced with ``int``.

    Returns:
        ``(audio_path, summary_text, pdf_path)`` on success, or
        ``(None, "An error occurred: ...", None)`` on any failure so the
        UI can display the message instead of crashing.
    """
    try:
        text = text or ""

        # The file is optional: only inspect it when something was uploaded.
        if PDF is not None:
            # Accept both a plain path string and a file wrapper with `.name`.
            file_path = getattr(PDF, "name", PDF)
            file_extension = os.path.splitext(file_path)[1].lower()
            # If DOCX, first convert it to PDF, then extract the text from it
            if file_extension == '.docx':
                pdf_file_path = docx_to_pdf(file_path)
                text = extract_text(pdf_file_path)
            # If PDF, extract text from it directly (typed text takes priority)
            elif file_extension == '.pdf' and not text.strip():
                text = extract_text(file_path)

        # Nothing to summarize: report it rather than summarizing an empty string.
        if not text.strip():
            return (
                None,
                "An error occurred: no text to summarize; "
                "enter text or upload a PDF or DOCX file.",
                None,
            )

        min_length = int(min_length)

        # Tokenize text. truncation=True is required for max_length to take
        # effect; without it over-long documents are passed through uncut.
        inputs = tokenizer(
            [text], max_length=1024, truncation=True, return_tensors="pt"
        )

        # Generate summary
        summary_ids = model.generate(
            inputs["input_ids"],
            num_beams=2,
            min_length=min_length,
            max_length=min_length + 1000,
        )
        output_text = tokenizer.batch_decode(
            summary_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )[0]

        # Save summarized text to PDF. The built-in "Times" font is limited
        # to Latin-1, so characters outside it are replaced with "?" in the
        # PDF only; the returned summary text is left untouched.
        pdf_safe_text = output_text.encode("latin-1", "replace").decode("latin-1")
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Times", size=12)
        pdf.multi_cell(190, 10, txt=pdf_safe_text, align='C')
        pdf_output_path = "legal.pdf"
        pdf.output(pdf_output_path)

        # Convert summarized text to audio. gTTS writes MP3 data, so the
        # file is named accordingly instead of ".wav".
        audio_output_path = "legal.mp3"
        tts = gTTS(text=output_text, lang='en', slow=False)
        tts.save(audio_output_path)

        return audio_output_path, output_text, pdf_output_path
    except Exception as e:
        # UI boundary: surface the failure as a message in the summary box.
        return None, f"An error occurred: {str(e)}", None
# Gradio interface: pdf_to_text receives (text, file, slider value) and
# returns (audio, summary text, summary PDF), matching the lists below.
_input_components = [
    gr.Textbox(label="Input Text"),
    gr.File(label="Upload PDF or DOCX"),
    gr.Slider(
        minimum=10,
        maximum=100,
        step=10,
        value=20,
        label="Summary Minimum Length",
    ),
]
_output_components = [
    gr.Audio(label="Generated Audio"),
    gr.Textbox(label="Generated Summary"),
    gr.File(label="Summary PDF"),
]
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=_input_components,
    outputs=_output_components,
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|