File size: 1,857 Bytes
46193fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
"""from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from pysummarization.nlpbase.auto_abstractor import AutoAbstractor
from pysummarization.tokenizabledoc.simple_tokenizer import SimpleTokenizer
from pysummarization.abstractabledoc.top_n_rank_abstractor import TopNRankAbstractor
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words"""

import PyPDF2
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

def summarize_pdf_with_textrank(pdf_path: str, sentences_count: int = 10) -> str:
    """
    Summarize the text content of a PDF file using the TextRank algorithm.

    The text of every page is extracted with PyPDF2, combined into a single
    document, parsed with sumy's plaintext parser (English tokenizer) and
    ranked with ``TextRankSummarizer``.

    Args:
    pdf_path (str): Path to the PDF file; opened in binary mode.
    sentences_count (int): Number of sentences requested for the summary.

    Returns:
    str: The selected sentences joined by newlines, or the message
        "Text extraction from PDF failed or PDF is empty." when no
        non-whitespace text could be extracted from any page.

    Raises:
    OSError: If the file cannot be opened (for example
        ``FileNotFoundError`` when ``pdf_path`` does not exist).
        Errors raised by PyPDF2 or sumy are not caught here and propagate
        to the caller.
    """

    # Extract text page by page. A page with no extractable text yields a
    # falsy value from extract_text(), which is normalised to "".
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

    # Join pages with a newline rather than gluing them together directly:
    # without a separator the last word of one page fuses with the first
    # word of the next, which corrupts word and sentence tokenization.
    pdf_text = "\n".join(page_texts)

    # Check if text extraction was successful
    if not pdf_text.strip():
        return "Text extraction from PDF failed or PDF is empty."

    # Create a parser for the extracted text
    parser = PlaintextParser.from_string(pdf_text, Tokenizer("english"))

    # Use TextRank for summarization
    text_rank_summarizer = TextRankSummarizer()
    text_rank_summary = text_rank_summarizer(
        parser.document, sentences_count=sentences_count
    )

    # Compile summary into a single string, one sentence per line
    return "\n".join(str(sentence) for sentence in text_rank_summary)