# Hugging Face Spaces app (Space status when this page was captured: Sleeping).
import functools
import os
import tempfile
import traceback

import gradio as gr
import pdfplumber
import textract
from docx import Document
from langdetect import DetectorFactory, detect
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
# Function to detect language from the text
def detect_language(text):
    """Return the language code langdetect assigns to *text* (e.g. "en").

    Args:
        text: The extracted document text.

    Returns:
        The detected language code, or None when *text* is blank, is not a
        string, or detection fails for any reason.
    """
    # Nothing to analyse: skip the detector instead of logging a spurious error.
    if not isinstance(text, str) or not text.strip():
        return None
    try:
        # langdetect samples randomly; pinning the seed makes the same document
        # always map to the same language (and therefore the same model).
        DetectorFactory.seed = 0
        return detect(text)
    except Exception as e:
        # Best-effort: callers treat None as "detection failed".
        print(f"Error detecting language: {e}")
        return None
# Function to read document and extract text
def read_document(file_path):
    """Extract plain text from the document stored at *file_path*.

    PDFs are read with pdfplumber, DOCX files with python-docx, and every
    other extension is handed to textract. The extension check is
    case-insensitive.

    Args:
        file_path: Path of the document, as a string.

    Returns:
        The extracted text, with PDF pages / DOCX paragraphs separated by
        newlines. Extraction is best-effort: on failure the error is printed
        and whatever text was collected so far (possibly "") is returned.
    """
    chunks = []
    try:
        lowered = file_path.lower()
        if lowered.endswith('.pdf'):
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    # extract_text() can yield None for a page with no text
                    # layer; treat that as an empty page instead of crashing.
                    chunks.append(page.extract_text() or "")
        elif lowered.endswith('.docx'):
            doc = Document(file_path)
            chunks.extend(para.text for para in doc.paragraphs)
        else:
            chunks.append(textract.process(file_path).decode())
    except Exception as e:
        print(f"An error occurred: {e}")
    # Newline-join so the last word of one page/paragraph does not fuse with
    # the first word of the next.
    return "\n".join(chunks)
# Function to get summarization model
@functools.lru_cache(maxsize=2)
def _load_summarizer(model_name):
    """Build the summarization pipeline for *model_name*, once per model.

    Loading the tokenizer and weights is expensive, so the resulting pipeline
    is cached and reused by later requests instead of being rebuilt each time.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return pipeline("summarization", model=model, tokenizer=tokenizer)


def get_summarizer(language):
    """Return a summarization pipeline suited to *language*.

    Args:
        language: Language code as produced by language detection.

    Returns:
        A transformers "summarization" pipeline: facebook/bart-large-cnn for
        "en", facebook/mbart-large-50 for every other language.
    """
    # NOTE(review): mbart-large-50 is used as-is for all non-English text;
    # confirm this checkpoint gives usable summaries for the target languages.
    model_name = "facebook/bart-large-cnn" if language == "en" else "facebook/mbart-large-50"
    return _load_summarizer(model_name)
# Gradio interface function to summarize the document
def _sniff_suffix(payload):
    """Guess a file extension from the leading magic bytes of *payload*."""
    if payload.startswith(b"%PDF"):
        return ".pdf"
    if payload.startswith(b"PK\x03\x04"):
        # ZIP container; DOCX is the only ZIP-based format the UI advertises.
        return ".docx"
    return ".txt"


def _extract_upload_text(file_info):
    """Return the text of an upload given as bytes, a path, a dict or a file wrapper."""
    if isinstance(file_info, (bytes, bytearray)):
        # Raw bytes carry no filename, so pick the extension from the content
        # and spill to a temporary file that is removed once it has been read.
        with tempfile.TemporaryDirectory() as workdir:
            path = os.path.join(workdir, "upload" + _sniff_suffix(file_info))
            with open(path, "wb") as handle:
                handle.write(file_info)
            return read_document(path)
    if isinstance(file_info, str):
        return read_document(file_info)
    if isinstance(file_info, dict):
        return read_document(file_info["name"])
    # Temp-file style wrapper exposing the on-disk path as `.name`.
    return read_document(file_info.name)


def summarize_document(file_info):
    """Summarize an uploaded document and return the text shown to the user.

    Args:
        file_info: The value delivered by the file input. Raw bytes, a path
            string, a dict with a "name" key, or an object with a `.name`
            path attribute are all accepted.

    Returns:
        The generated summary, or a human-readable message when no file was
        uploaded, the document is empty/unreadable, language detection fails,
        or any other error occurs (the error text is returned, not raised).
    """
    try:
        if file_info is None:
            return "No document was uploaded."
        # Read the uploaded file and extract text
        text = _extract_upload_text(file_info)
        if not text.strip():
            return "The document is empty or could not be read."
        # Detect the language of the text
        language = detect_language(text)
        if not language:
            return "Language detection failed."
        # Get the appropriate summarizer model
        summarizer = get_summarizer(language)
        # Generate summary; truncation keeps long documents within the model's input limit
        summary = summarizer(text, max_length=130, min_length=30, truncation=True)
        return summary[0]['summary_text']
    except Exception as e:
        # UI boundary: log the full traceback server-side and show the
        # message to the user instead of crashing the app.
        print(f"An error occurred: {e}")
        traceback.print_exc()
        return str(e)  # Return the error message as output to the user
# Upload widget for the single document to summarize.
# NOTE(review): with type="binary" the callback presumably receives the raw
# file contents rather than a path -- confirm against the installed Gradio.
document_input = gr.File(
    type="binary",
    label="Upload your document (PDF, DOCX, or TXT)",
)

# Wire the upload widget to the summarizer and render the result as plain text.
iface = gr.Interface(
    title="Document Summarizer",
    description="Upload your document and get a summarized version of its content. Currently supports English and French.",
    fn=summarize_document,
    inputs=document_input,
    outputs="text",
)

# Start serving the app.
iface.launch()