File size: 3,192 Bytes
17e34a5
94bf427
 
 
 
3813c2d
94bf427
ac28e59
82e6a9a
 
cffceba
94bf427
11ef927
94bf427
 
 
11ef927
82e6a9a
ac28e59
 
 
 
 
 
82e6a9a
 
 
 
 
 
 
 
ec8c26c
82e6a9a
 
ac28e59
 
94bf427
 
7f2b3e5
ac28e59
 
 
 
 
 
 
 
 
94bf427
ac28e59
170c2bc
 
 
94bf427
170c2bc
 
 
94bf427
 
 
 
 
170c2bc
94bf427
 
 
 
 
170c2bc
7f2b3e5
94bf427
170c2bc
94bf427
7f2b3e5
 
 
94bf427
7f2b3e5
94bf427
ac28e59
94bf427
7f2b3e5
0751294
342a4a2
ac28e59
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import gradio as gr
import os
import nltk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from fpdf import FPDF
from gtts import gTTS
from pdfminer.high_level import extract_text
from docx import Document
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

# Fetch the NLTK 'punkt' resource when the module is imported.
nltk.download('punkt')

# Build the tokenizer and the seq2seq model a single time at import so that
# individual summarization requests reuse them instead of reloading.
_CHECKPOINT = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_CHECKPOINT)

# Function to convert DOCX to a plain-text PDF using reportlab
def docx_to_pdf(docx_file, output_pdf="converted_doc.pdf"):
    """Render the paragraph text of a DOCX file into a simple text-only PDF.

    Only the text of ``doc.paragraphs`` is written; formatting is not carried
    over. Each paragraph is word-wrapped to the printable width and a new page
    is started whenever the cursor reaches the bottom margin, so long
    documents are no longer drawn past the edge of the first page.

    NOTE: the built-in "Helvetica" font is used, so characters outside that
    font's encoding may not render faithfully.

    Args:
        docx_file: Path (or file-like object) accepted by ``docx.Document``.
        output_pdf: Path of the PDF file to create.

    Returns:
        The ``output_pdf`` path that was written.
    """
    # Local import keeps this block self-contained; reportlab is already a
    # dependency of this module.
    from reportlab.lib.utils import simpleSplit

    font_name, font_size = "Helvetica", 12
    left_margin, top_y, bottom_y = 40, 750, 40
    page_width = letter[0]
    usable_width = page_width - 2 * left_margin

    doc = Document(docx_file)
    pdf = canvas.Canvas(output_pdf, pagesize=letter)

    def start_page():
        # The font is set again for every page because showPage() resets the
        # canvas graphics state.
        pdf.setFont(font_name, font_size)
        return pdf.beginText(left_margin, top_y)

    text = start_page()
    for para in doc.paragraphs:
        # simpleSplit returns an empty list for an empty paragraph; keep the
        # blank line so paragraph spacing is preserved.
        wrapped_lines = simpleSplit(para.text, font_name, font_size, usable_width) or [""]
        for line in wrapped_lines:
            if text.getY() < bottom_y:
                # Page is full: flush it and continue on a fresh page.
                pdf.drawText(text)
                pdf.showPage()
                text = start_page()
            text.textLine(line)

    pdf.drawText(text)
    pdf.save()
    return output_pdf

# Helper: read the text of an uploaded PDF or DOCX file
def _extract_upload_text(upload):
    """Return the text content of an uploaded ``.pdf`` or ``.docx`` file.

    Args:
        upload: Either a filesystem path string or an object that exposes the
            path through a ``name`` attribute.

    Returns:
        The extracted text as a single string.

    Raises:
        ValueError: If the file extension is neither ``.pdf`` nor ``.docx``.
    """
    path = getattr(upload, "name", upload)
    file_extension = os.path.splitext(path)[1].lower()

    if file_extension == '.pdf':
        return extract_text(path)
    if file_extension == '.docx':
        # Read the paragraphs directly instead of round-tripping through an
        # intermediate PDF, which is lossy and leaves a stray file behind.
        return "\n".join(para.text for para in Document(path).paragraphs)

    raise ValueError(
        f"Unsupported file type '{file_extension}'; please upload a PDF or DOCX file."
    )


# Main processing function
def pdf_to_text(text, PDF, min_length=20):
    """Summarize typed text or an uploaded PDF/DOCX and export the summary.

    Typed text takes priority; the uploaded file is only read when the text
    box is empty (the same rule now applies to both PDF and DOCX uploads).
    The summary is written to ``legal.pdf`` and spoken into ``legal.mp3``.

    Args:
        text: Text typed by the user; may be empty or ``None``.
        PDF: Uploaded file (path string or object with a ``name`` attribute),
            or ``None`` when nothing was uploaded.
        min_length: Minimum length passed to ``model.generate``.

    Returns:
        A tuple ``(audio_path, summary_text, pdf_path)``. On any failure the
        tuple is ``(None, "An error occurred: <reason>", None)`` so the UI can
        show the message instead of crashing.
    """
    try:
        # Fall back to the uploaded file only when no text was typed
        if text is None or not text.strip():
            if PDF is None:
                raise ValueError("Please enter text or upload a PDF or DOCX file.")
            text = _extract_upload_text(PDF)
            if not text.strip():
                raise ValueError("No text could be extracted from the uploaded file.")

        # Tokenize text; truncation=True is required for max_length to
        # actually shorten inputs that exceed the model's input window.
        inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors="pt")
        min_length = int(min_length)

        # Generate summary, keeping max_length within the model's position limit
        position_limit = getattr(model.config, "max_position_embeddings", 1024)
        summary_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            num_beams=2,
            min_length=min_length,
            max_length=min(min_length + 1000, position_limit),
        )
        output_text = tokenizer.batch_decode(
            summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )[0]

        # Save summarized text to PDF. The core "Times" font only covers
        # Latin-1, so unsupported characters are replaced rather than raising.
        printable_text = output_text.encode("latin-1", "replace").decode("latin-1")
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Times", size=12)
        pdf.multi_cell(190, 10, printable_text, align='C')
        pdf_output_path = "legal.pdf"
        pdf.output(pdf_output_path)

        # Convert summarized text to audio. gTTS writes MP3 data, so the file
        # carries an .mp3 extension.
        audio_output_path = "legal.mp3"
        tts = gTTS(text=output_text, lang='en', slow=False)
        tts.save(audio_output_path)

        return audio_output_path, output_text, pdf_output_path

    except Exception as e:
        # UI boundary: report the failure in the summary textbox.
        return None, f"An error occurred: {str(e)}", None

# Gradio interface: wires pdf_to_text to three inputs (typed text, uploaded
# file, minimum-length slider) and three outputs (audio, summary text, PDF).
iface = gr.Interface(
    fn=pdf_to_text,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.File(label="Upload PDF or DOCX"),
        gr.Slider(
            minimum=10,
            maximum=100,
            step=10,
            value=20,
            label="Summary Minimum Length",
        ),
    ],
    outputs=[
        gr.Audio(label="Generated Audio"),
        gr.Textbox(label="Generated Summary"),
        gr.File(label="Summary PDF"),
    ],
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()