# Spaces: Runtime error
"""from sumy.parsers.plaintext import PlaintextParser | |
from sumy.nlp.tokenizers import Tokenizer | |
from sumy.summarizers.lsa import LsaSummarizer | |
from sumy.summarizers.lex_rank import LexRankSummarizer | |
from sumy.summarizers.text_rank import TextRankSummarizer | |
from pysummarization.nlpbase.auto_abstractor import AutoAbstractor | |
from pysummarization.tokenizabledoc.simple_tokenizer import SimpleTokenizer | |
from pysummarization.abstractabledoc.top_n_rank_abstractor import TopNRankAbstractor | |
from sumy.nlp.stemmers import Stemmer | |
from sumy.utils import get_stop_words""" | |
import PyPDF2 | |
from sumy.parsers.plaintext import PlaintextParser | |
from sumy.nlp.tokenizers import Tokenizer | |
from sumy.summarizers.text_rank import TextRankSummarizer | |
def summarize_pdf_with_textrank(pdf_path: str, sentences_count: int = 10) -> str:
    """Summarize the text content of a PDF file with the TextRank algorithm.

    Args:
        pdf_path: Path to the PDF file to read.
        sentences_count: Number of sentences to request for the summary.

    Returns:
        The selected sentences joined by newlines, or a fixed failure
        message when no text could be extracted from any page.

    Raises:
        OSError: If ``pdf_path`` cannot be opened (for example
            ``FileNotFoundError`` when the file does not exist).
    """
    # Keep extraction inside the `with` block: the reader is bound to the
    # open file handle.
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # extract_text() can produce a falsy value for a page; normalize
        # it to an empty string so joining never fails.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

    # Join pages with a newline so the last word of one page is not fused
    # with the first word of the next, which would blur sentence boundaries
    # for the tokenizer.
    pdf_text = "\n".join(page_texts)

    # Nothing but whitespace means there is nothing to summarize.
    if not pdf_text.strip():
        return "Text extraction from PDF failed or PDF is empty."

    # Parse the extracted text into a document using the English tokenizer.
    parser = PlaintextParser.from_string(pdf_text, Tokenizer("english"))

    # Run TextRank over the parsed document.
    summarizer = TextRankSummarizer()
    summary_sentences = summarizer(parser.document, sentences_count=sentences_count)

    # One selected sentence per line.
    return "\n".join(str(sentence) for sentence in summary_sentences)