# Page header captured with this file: RyhanSunny, "Update app.py", commit 7dfdb33 (verified)
import functools
import os
import tempfile
import traceback

import gradio as gr
import pdfplumber
import textract
from docx import Document
from langdetect import detect
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
# Function to detect language from the text
def detect_language(text):
    """Return the language code that ``detect`` reports for *text*.

    Any exception raised during detection is printed and swallowed, in which
    case ``None`` is returned so callers can treat it as "unknown language".
    """
    language_code = None
    try:
        language_code = detect(text)
    except Exception as err:
        print(f"Error detecting language: {err}")
    return language_code
# Function to read document and extract text
def read_document(file_path):
    """Extract the plain text of the document stored at *file_path*.

    The reader is chosen from the (case-insensitive) file extension:
    ``.pdf`` is read page by page with pdfplumber, ``.docx`` paragraph by
    paragraph with python-docx, and everything else is handed to textract.

    Extraction is best-effort: any error is printed, and whatever text was
    collected before the failure is still returned (an empty string when
    nothing could be read).

    Args:
        file_path: Path of the document to read.

    Returns:
        The extracted text, with pages/paragraphs separated by newlines.
    """
    chunks = []
    try:
        lowered_path = file_path.lower()
        if lowered_path.endswith('.pdf'):
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    # extract_text() may yield None for a page without a text
                    # layer; "or ''" keeps one blank page from aborting the
                    # whole document.
                    chunks.append(page.extract_text() or "")
        elif lowered_path.endswith('.docx'):
            doc = Document(file_path)
            chunks.extend(para.text for para in doc.paragraphs)
        else:
            chunks.append(textract.process(file_path).decode())
    except Exception as e:
        # Deliberately broad: three different third-party readers can fail in
        # unrelated ways, and the caller only needs "text or nothing".
        print(f"An error occurred: {e}")
    # Join with newlines so the last word of one page/paragraph is not fused
    # with the first word of the next; empty chunks are dropped.
    return "\n".join(chunk for chunk in chunks if chunk)
# Function to get summarization model
@functools.lru_cache(maxsize=2)
def _load_summarizer(model_name):
    """Build the summarization pipeline for *model_name* (cached per model)."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return pipeline("summarization", model=model, tokenizer=tokenizer)


def get_summarizer(language):
    """Return a summarization pipeline for the given language code.

    ``"en"`` selects ``facebook/bart-large-cnn``; every other value selects
    ``facebook/mbart-large-50``.

    Args:
        language: Language code, compared against ``"en"``.

    Returns:
        A transformers ``summarization`` pipeline. The pipeline is cached by
        model name, so repeated calls reuse the already-loaded tokenizer and
        model instead of loading them again on every request.
    """
    model_name = "facebook/bart-large-cnn" if language == "en" else "facebook/mbart-large-50"
    return _load_summarizer(model_name)
# Gradio interface function to summarize the document
def _resolve_upload_path(file_info):
    """Return ``(path, is_temporary)`` for the value passed to the callback."""
    if isinstance(file_info, (bytes, bytearray)):
        # Raw bytes carry no filename, and read_document picks its reader from
        # the extension, so choose a suffix from the leading magic bytes.
        if file_info.startswith(b"%PDF"):
            suffix = ".pdf"
        elif file_info.startswith(b"PK\x03\x04"):  # zip container, as used by .docx
            suffix = ".docx"
        else:
            suffix = ".txt"
        fd, temp_path = tempfile.mkstemp(suffix=suffix)
        with os.fdopen(fd, "wb") as handle:
            handle.write(file_info)
        return temp_path, True
    if isinstance(file_info, str):
        return file_info, False
    if isinstance(file_info, dict):
        return file_info["name"], False
    # Fall back to file-like objects that expose their path as ``.name``.
    return file_info.name, False


def summarize_document(file_info):
    """Summarize an uploaded document and return the summary text.

    Args:
        file_info: The uploaded file. Accepted forms are raw ``bytes``, a path
            string, a dict with a ``"name"`` path entry, or an object with a
            ``.name`` path attribute. ``None`` means nothing was uploaded.

    Returns:
        The summary text on success. On failure a human-readable message is
        returned instead of raising: a fixed message for a missing upload, an
        empty/unreadable document or failed language detection, and the
        exception text for any other error.
    """
    if file_info is None:
        return "No file was uploaded."
    temp_path = None
    try:
        # Read the uploaded file and extract text
        file_path, is_temporary = _resolve_upload_path(file_info)
        if is_temporary:
            temp_path = file_path
        text = read_document(file_path)
        if not text.strip():
            return "The document is empty or could not be read."
        # Detect the language of the text
        language = detect_language(text)
        if not language:
            return "Language detection failed."
        # Get the appropriate summarizer model
        summarizer = get_summarizer(language)
        # Generate summary; truncation=True is passed so over-long input is cut
        # rather than rejected.
        summary = summarizer(text, max_length=130, min_length=30, truncation=True)
        return summary[0]['summary_text']
    except Exception as e:
        # Top-level boundary of the UI callback: log the full traceback and
        # show the error message to the user instead of crashing the request.
        print(f"An error occurred: {e}")
        traceback.print_exc()
        return str(e)
    finally:
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass  # best-effort cleanup of the temporary copy
# Gradio app interface
# One file-upload input feeds summarize_document; its return value is shown
# as plain text.
document_upload = gr.File(
    type="binary",
    label="Upload your document (PDF, DOCX, or TXT)",
)
iface = gr.Interface(
    title="Document Summarizer",
    description="Upload your document and get a summarized version of its content. Currently supports English and French.",
    fn=summarize_document,
    inputs=document_upload,
    outputs="text",
)
# Start serving the interface.
iface.launch()