ToS-Summarization / extractive_model.py
EmreYY20
integrate extractive model in streamlit
46193fd
raw
history blame
1.86 kB
"""from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer
from pysummarization.nlpbase.auto_abstractor import AutoAbstractor
from pysummarization.tokenizabledoc.simple_tokenizer import SimpleTokenizer
from pysummarization.abstractabledoc.top_n_rank_abstractor import TopNRankAbstractor
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words"""
import PyPDF2
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
def summarize_pdf_with_textrank(pdf_path: str, sentences_count: int = 10) -> str:
    """Summarize the text of a PDF file with the TextRank algorithm.

    Text is extracted page by page with PyPDF2, parsed by sumy as English
    plain text, and the sentences selected by TextRank are returned.

    Args:
        pdf_path (str): Path to the PDF file.
        sentences_count (int): Number of sentences requested for the summary.

    Returns:
        str: The selected sentences joined by newlines, or a fixed
        explanatory message when no text could be extracted from the PDF.

    Raises:
        OSError: If the file cannot be opened. Errors raised by PyPDF2 or
            sumy are not caught here and propagate to the caller.
    """
    # Extract text while the file is open; the reader is backed by this handle.
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # `or ""` guards against a page whose extraction yields a falsy result.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

    # Join pages with a newline so the last word of one page is not fused to
    # the first word of the next, which would blur sentence boundaries.
    pdf_text = "\n".join(page_texts)

    # Nothing but whitespace means extraction failed or the PDF has no text.
    if not pdf_text.strip():
        return "Text extraction from PDF failed or PDF is empty."

    # Parse the extracted text as English plain text.
    parser = PlaintextParser.from_string(pdf_text, Tokenizer("english"))

    # Rank sentences with TextRank and keep the requested number.
    text_rank_summarizer = TextRankSummarizer()
    text_rank_summary = text_rank_summarizer(
        parser.document, sentences_count=sentences_count
    )

    # Compile the selected sentences into a single newline-separated string.
    return "\n".join(str(sentence) for sentence in text_rank_summary)