|
import gradio as gr |
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
|
from PyPDF2 import PdfReader |
|
from fpdf import FPDF |
|
from deep_translator import GoogleTranslator |
|
import tempfile |
|
|
|
|
|
# Checkpoint shared by the tokenizer and the seq2seq model. Both objects are
# created once at import time and reused by every summarization request.
_SUMMARIZER_CHECKPOINT = "nsi319/legal-led-base-16384"

tokenizer = AutoTokenizer.from_pretrained(_SUMMARIZER_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_SUMMARIZER_CHECKPOINT)
|
|
|
|
|
|
|
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_file: Whatever ``PdfReader`` accepts; the value is passed to it
            unchanged.

    Returns:
        The text of all pages that produced any text, joined with newlines.
        The result is an empty string when no page yields text.
    """
    reader = PdfReader(pdf_file)
    # ``or ""`` keeps the join safe should a page yield None instead of a
    # string (the previous ``text += ...`` would raise TypeError in that case).
    page_texts = (page.extract_text() or "" for page in reader.pages)
    # A newline between pages stops the last word of one page from being
    # glued to the first word of the next.
    return "\n".join(page_text for page_text in page_texts if page_text)
|
|
|
|
|
|
|
def summarize_text(text, max_input_length=16384, max_summary_length=512,
                   min_summary_length=150):
    """Summarize *text* with the module-level ``tokenizer`` and ``model``.

    Args:
        text: Raw document text. Runs of whitespace are collapsed to single
            spaces before tokenization.
        max_input_length: Token limit for the input; longer inputs are
            truncated by the tokenizer.
        max_summary_length: Upper bound passed to ``generate`` as
            ``max_length``.
        min_summary_length: Lower bound passed to ``generate`` as
            ``min_length``. It is clamped to ``max_summary_length`` so the two
            bounds can never contradict each other.

    Returns:
        The decoded summary, or an empty string when *text* is empty or
        contains only whitespace.
    """
    normalized = " ".join(text.split()) if text else ""
    if not normalized:
        # Nothing to summarize. Without this guard the model would still be
        # forced to emit at least ``min_length`` tokens of made-up text.
        return ""

    inputs = tokenizer(
        normalized,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )

    summary_ids = model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly rather than letting generate() infer it.
        attention_mask=inputs["attention_mask"],
        max_length=max_summary_length,
        min_length=min(min_summary_length, max_summary_length),
        length_penalty=2.0,
        num_beams=5,
        early_stopping=True,
        no_repeat_ngram_size=3,
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
|
|
|
|
|
|
def save_summary_to_pdf(summary_text, output_pdf_path):
    """Write *summary_text* to a PDF with a centered title.

    Args:
        summary_text: The summary to render; each line becomes its own
            wrapped paragraph.
        output_pdf_path: Destination path for the generated PDF.

    Returns:
        ``output_pdf_path``, unchanged, so the call can be used as the return
        value of a handler.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()

    # Title line.
    pdf.set_font("Arial", "B", 16)
    pdf.cell(200, 10, txt="Legal Document Summary", ln=True, align="C")

    # Body text.
    pdf.set_font("Arial", size=12)
    for line in summary_text.splitlines():
        # The built-in Arial font is limited to Latin-1. Characters outside
        # it (curly quotes, dashes, ...) are replaced with "?" instead of
        # aborting the export, matching what create_pdf() does.
        safe_line = line.encode("latin-1", "replace").decode("latin-1")
        # Start every paragraph at the left margin: newer fpdf2 releases can
        # leave the cursor at the right edge after multi_cell(), which makes
        # the next full-width call fail.
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 10, safe_line)

    pdf.output(output_pdf_path)
    return output_pdf_path
|
|
|
|
|
|
|
def process_pdf_summary(pdf_file):
    """Summarize an uploaded PDF and return the path of the summary PDF.

    Args:
        pdf_file: The value delivered by the upload component; expected to be
            ``None`` when the user has not uploaded anything.

    Returns:
        Path of a newly created PDF file containing the summary.

    Raises:
        gr.Error: If no file was uploaded or the PDF contains no extractable
            text.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF document first.")

    pdf_text = extract_text_from_pdf(pdf_file)
    if not (pdf_text or "").strip():
        raise gr.Error("No extractable text was found in the uploaded PDF.")

    summary = summarize_text(pdf_text)

    # Use a unique temporary file per request. A fixed file name in the
    # working directory would be overwritten when two requests overlap.
    with tempfile.NamedTemporaryFile(
        delete=False, prefix="legal_document_summary_", suffix=".pdf"
    ) as tmp:
        output_pdf_path = tmp.name

    return save_summary_to_pdf(summary, output_pdf_path)
|
|
|
|
|
|
|
def translate_text(text, language, max_chars=4500):
    """Translate *text* into *language* with Google Translate.

    The text is sent in pieces of at most *max_chars* characters because the
    translator rejects over-long inputs; whole lines are kept together where
    possible so sentences are not cut in the middle.

    Args:
        text: Text to translate. The source language is auto-detected.
        language: Target language code (for example ``"hi"``).
        max_chars: Maximum size of each request. The default stays below the
            translator's 5000-character limit.

    Returns:
        The translated pieces joined with newlines, or an empty string when
        *text* is empty or whitespace only.

    Raises:
        ValueError: If *max_chars* is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")
    if not text or not text.strip():
        return ""

    translator = GoogleTranslator(source="auto", target=language)
    translated_parts = []
    for chunk in _split_for_translation(text, max_chars):
        if not chunk.strip():
            continue  # nothing translatable in this piece
        # ``or ""`` guards against a backend returning None for a piece.
        translated_parts.append(translator.translate(chunk) or "")
    return "\n".join(translated_parts)


def _split_for_translation(text, max_chars):
    """Yield pieces of *text* of at most *max_chars*, breaking at line ends."""
    current = ""
    for line in text.splitlines():
        for piece in _split_long_line(line, max_chars):
            candidate = f"{current}\n{piece}" if current else piece
            if current and len(candidate) > max_chars:
                yield current
                current = piece
            else:
                current = candidate
    if current:
        yield current


def _split_long_line(line, max_chars):
    """Yield slices of one line of at most *max_chars*, cutting at spaces."""
    while len(line) > max_chars:
        cut = line.rfind(" ", 0, max_chars + 1)
        if cut <= 0:
            cut = max_chars  # no space available: fall back to a hard cut
        yield line[:cut]
        line = line[cut:].lstrip(" ")
    yield line
|
|
|
|
|
|
|
def create_pdf(text):
    """Render *text* into a new temporary PDF file and return its path.

    Args:
        text: Text to render; it is split on ``"\\n"`` and each line is
            written as a wrapped paragraph.

    Returns:
        Path of the generated PDF. The file is created with ``delete=False``,
        so it is not removed automatically.
    """
    # Reserve a unique path and close the handle right away, so no open file
    # descriptor is leaked and FPDF can reopen the path for writing.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        pdf_output = tmp.name

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    for line in text.split("\n"):
        # NOTE(review): the built-in Arial font only covers Latin-1, so every
        # character outside it becomes "?". Text in non-Latin scripts (the
        # language selector offers "hi", "ta" and "ml") will therefore be
        # unreadable in the PDF; supporting it presumably requires
        # registering a Unicode TrueType font - confirm and follow up.
        safe_line = line.encode("latin-1", "replace").decode("latin-1")
        # multi_cell wraps long lines instead of letting them run off the
        # page. Reset x first: newer fpdf2 releases can leave the cursor at
        # the right edge after multi_cell(), which breaks the next call.
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 10, safe_line)

    pdf.output(pdf_output)
    return pdf_output
|
|
|
|
|
|
|
def process_pdf_translation(pdf_file, language):
    """Translate an uploaded PDF and return both a PDF and the plain text.

    Args:
        pdf_file: The value delivered by the upload component; expected to be
            ``None`` when the user has not uploaded anything.
        language: Target language code forwarded to ``translate_text``.

    Returns:
        A ``(translated_pdf_path, translated_text)`` tuple.

    Raises:
        gr.Error: If no file was uploaded or the PDF contains no extractable
            text.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF document first.")

    extracted_text = extract_text_from_pdf(pdf_file)
    if not (extracted_text or "").strip():
        raise gr.Error("No extractable text was found in the uploaded PDF.")

    translated_text = translate_text(extracted_text, language)
    translated_pdf = create_pdf(translated_text)
    return translated_pdf, translated_text
|
|
|
|
|
|
|
# User interface: one tab per operation, each wired to its handler function.
with gr.Blocks() as app:
    gr.Markdown("# Legal Document Translator and Summarizer")
    gr.Markdown("Choose an operation to perform on your document:")

    # Tab 1: upload a PDF and download a PDF containing its summary.
    with gr.Tab("Summarization"):
        gr.Markdown("### Upload PDF for Summarization")
        pdf_input_summary = gr.File(label="Upload PDF Document")
        summary_output = gr.File(label="Download Summary PDF")
        summarize_button = gr.Button("Summarize")

        summarize_button.click(
            fn=process_pdf_summary,
            inputs=pdf_input_summary,
            outputs=summary_output,
        )

    # Tab 2: upload a PDF, choose a target language, and receive both a
    # translated PDF and the translated text.
    with gr.Tab("Translation"):
        gr.Markdown("### Upload PDF for Translation")
        pdf_input_translation = gr.File(label="Upload PDF Document")
        language_options = ["hi", "ta", "ml", "en"]
        language_selector = gr.Dropdown(
            choices=language_options,
            label="Select Language",
            value="hi",
        )
        translation_output = gr.File(label="Download Translated PDF")
        translated_text_output = gr.Textbox(label="Translated Text", lines=10)
        translate_button = gr.Button("Translate")

        translate_button.click(
            fn=process_pdf_translation,
            inputs=[pdf_input_translation, language_selector],
            outputs=[translation_output, translated_text_output],
        )
|
|
|
|
|
if __name__ == "__main__":
    # Start the web server only when this file is run as a script, so that
    # importing the module does not launch the app.
    app.launch()