# Page header captured from the hosting site (not part of the program):
#   sairamn's picture
#   Update app.py (#2)
#   86f9d89 verified
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from PyPDF2 import PdfReader
from fpdf import FPDF
from deep_translator import GoogleTranslator
import tempfile
# Load the tokenizer and model for summarization
# Single checkpoint identifier shared by the tokenizer and the model, so the
# two loads below cannot drift apart.
_SUMMARIZER_CHECKPOINT = "nsi319/legal-led-base-16384"

tokenizer = AutoTokenizer.from_pretrained(_SUMMARIZER_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_SUMMARIZER_CHECKPOINT)
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page in *pdf_file*, one page per line group.

    Args:
        pdf_file: Anything ``PdfReader`` accepts as its source argument.

    Returns:
        The extracted page texts joined with a newline. Pages that yield no
        text contribute an empty string.
    """
    reader = PdfReader(pdf_file)
    # `or ''` guards against a falsy/None result so one text-less page cannot
    # break the concatenation. Joining with "\n" keeps the last word of a page
    # from fusing with the first word of the next one.
    return '\n'.join(page.extract_text() or '' for page in reader.pages)
# Function to summarize text
def summarize_text(text, max_input_length=16384, max_summary_length=512):
    """Summarize *text* with the module-level ``tokenizer`` and ``model``.

    Args:
        text: Raw text to summarize. Runs of whitespace are collapsed to a
            single space before tokenization.
        max_input_length: Token limit for the input; longer input is truncated.
        max_summary_length: Upper bound on the generated summary length.

    Returns:
        The decoded summary, or an empty string when *text* contains nothing
        but whitespace (the model is not invoked in that case).
    """
    normalized = ' '.join(text.split())
    if not normalized:
        # With nothing to summarize, generation would still be forced to emit
        # at least `min_length` tokens of invented text.
        return ""

    inputs = tokenizer(
        normalized,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )
    summary_ids = model.generate(
        inputs["input_ids"],
        # Forward the mask produced by the tokenizer instead of letting
        # generate() infer one.
        attention_mask=inputs["attention_mask"],
        max_length=max_summary_length,
        # Keep the lower bound consistent when a caller asks for a summary
        # shorter than 150 tokens.
        min_length=min(150, max_summary_length),
        length_penalty=2.0,
        num_beams=5,
        early_stopping=True,
        no_repeat_ngram_size=3,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
# Function to save summary to PDF
def save_summary_to_pdf(summary_text, output_pdf_path):
    """Write *summary_text* beneath a title to a PDF file.

    Args:
        summary_text: Summary to render; each line becomes a wrapped paragraph.
        output_pdf_path: Destination path of the PDF.

    Returns:
        *output_pdf_path*, unchanged.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()

    # Title. A width of 0 stretches the cell to the right margin, so
    # align="C" centres it on the printable area.
    pdf.set_font("Arial", 'B', 16)
    pdf.cell(0, 10, txt="Legal Document Summary", ln=True, align="C")

    # Body.
    pdf.set_font("Arial", size=12)
    for line in summary_text.splitlines():
        # Same Latin-1 sanitising that create_pdf() applies: characters the
        # encoding cannot represent become '?' instead of aborting the export.
        printable = line.encode('latin-1', 'replace').decode('latin-1')
        pdf.multi_cell(0, 10, printable)

    pdf.output(output_pdf_path)
    return output_pdf_path
# Function to process PDF for summarization
def process_pdf_summary(pdf_file):
    """Summarize an uploaded PDF and return the path of the summary PDF.

    Args:
        pdf_file: The value delivered by the upload component; ``None`` when
            nothing was uploaded.

    Returns:
        Path of the generated summary PDF.

    Raises:
        gr.Error: If no file was uploaded or no text could be extracted.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF document first.")

    pdf_text = extract_text_from_pdf(pdf_file)
    if not pdf_text.strip():
        raise gr.Error("No extractable text was found in the uploaded PDF.")

    summary = summarize_text(pdf_text)

    # Use a unique file per request; a fixed name in the working directory
    # would be overwritten by concurrent requests. The handle is closed before
    # the PDF writer opens the path.
    with tempfile.NamedTemporaryFile(
        delete=False, prefix="legal_document_summary_", suffix=".pdf"
    ) as tmp:
        output_pdf_path = tmp.name
    return save_summary_to_pdf(summary, output_pdf_path)
# Function to translate text
# Per-request character budget, kept below the translator's 5000-character cap.
_MAX_TRANSLATE_CHARS = 4500


def _split_into_chunks(text, max_chars):
    """Split *text* into pieces of at most *max_chars*, preferring line then word boundaries."""
    chunks = []
    current = ""
    for line in text.splitlines(keepends=True):
        # A single line longer than the budget is cut at the last space that
        # fits, or hard-cut when it contains no space at all.
        while len(line) > max_chars:
            if current:
                chunks.append(current)
                current = ""
            cut = line.rfind(" ", 0, max_chars)
            if cut <= 0:
                cut = max_chars
            chunks.append(line[:cut])
            line = line[cut:]
        if len(current) + len(line) > max_chars:
            chunks.append(current)
            current = ""
        current += line
    if current:
        chunks.append(current)
    return chunks


def translate_text(text, language):
    """Translate *text* into *language* using ``GoogleTranslator``.

    The text is sent in several requests of at most ``_MAX_TRANSLATE_CHARS``
    characters each, so documents longer than one request allows can still be
    translated.

    Args:
        text: Text to translate; the source language is auto-detected.
        language: Target language code passed to ``GoogleTranslator``.

    Returns:
        The translated chunks joined with newlines, or an empty string when
        *text* contains nothing but whitespace.
    """
    if not text.strip():
        return ""

    translator = GoogleTranslator(source='auto', target=language)
    translated_parts = []
    for chunk in _split_into_chunks(text, _MAX_TRANSLATE_CHARS):
        if not chunk.strip():
            # Nothing to translate in a whitespace-only chunk.
            continue
        # `or ""` keeps the join below safe if a request yields no text.
        translated_parts.append(translator.translate(chunk) or "")
    return "\n".join(translated_parts)
# Function to create a PDF from translated text
def create_pdf(text):
    """Render *text* into a new temporary PDF file and return its path.

    Args:
        text: Text to render; it is split on ``'\\n'`` and each piece is
            written as a wrapped paragraph.

    Returns:
        Path of the created ``.pdf`` file. The file is not deleted
        automatically.
    """
    # Reserve a unique path and close the handle right away so the PDF writer
    # can open the file itself.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        pdf_output = tmp.name

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)  # Use Arial instead of Latha
    for line in text.split('\n'):
        # NOTE: every character outside Latin-1 is replaced with '?', which
        # includes the Hindi, Tamil and Malayalam script characters.
        printable = line.encode('latin-1', 'replace').decode('latin-1')
        # multi_cell wraps at the right margin; cell() would let long lines
        # run off the page.
        pdf.multi_cell(0, 10, printable)
    pdf.output(pdf_output)
    return pdf_output
# Function to process PDF for translation
def process_pdf_translation(pdf_file, language):
    """Translate an uploaded PDF and return the result as a PDF and as text.

    Args:
        pdf_file: The value delivered by the upload component; ``None`` when
            nothing was uploaded.
        language: Target language code forwarded to ``translate_text``.

    Returns:
        A ``(translated_pdf_path, translated_text)`` tuple.

    Raises:
        gr.Error: If no file was uploaded or no text could be extracted.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF document first.")

    extracted_text = extract_text_from_pdf(pdf_file)
    if not extracted_text.strip():
        raise gr.Error("No extractable text was found in the uploaded PDF.")

    translated_text = translate_text(extracted_text, language)
    translated_pdf = create_pdf(translated_text)
    return translated_pdf, translated_text
# Create Gradio interface
with gr.Blocks() as app:
    # Page heading shown above both tabs.
    gr.Markdown("# Legal Document Translator and Summarizer")
    gr.Markdown("Choose an operation to perform on your document:")

    # Tab 1: upload a PDF, get a summary PDF back.
    with gr.Tab("Summarization"):
        gr.Markdown("### Upload PDF for Summarization")
        pdf_input_summary = gr.File(label="Upload PDF Document")
        summary_output = gr.File(label="Download Summary PDF")
        summarize_button = gr.Button("Summarize")
        summarize_button.click(
            fn=process_pdf_summary,
            inputs=pdf_input_summary,
            outputs=summary_output,
        )

    # Tab 2: upload a PDF, pick a target language, get a PDF plus plain text.
    with gr.Tab("Translation"):
        gr.Markdown("### Upload PDF for Translation")
        pdf_input_translation = gr.File(label="Upload PDF Document")
        # Hindi, Tamil, Malayalam, English
        language_options = ["hi", "ta", "ml", "en"]
        language_selector = gr.Dropdown(
            choices=language_options,
            label="Select Language",
            value="hi",
        )
        translation_output = gr.File(label="Download Translated PDF")
        translated_text_output = gr.Textbox(label="Translated Text", lines=10)
        translate_button = gr.Button("Translate")
        translate_button.click(
            fn=process_pdf_translation,
            inputs=[pdf_input_translation, language_selector],
            outputs=[translation_output, translated_text_output],
        )

# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()