|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sumy.parsers.plaintext import PlaintextParser |
|
from sumy.nlp.tokenizers import Tokenizer |
|
from sumy.nlp.stemmers import Stemmer |
|
from sumy.summarizers.lsa import LsaSummarizer |
|
from sumy.summarizers.text_rank import TextRankSummarizer |
|
from sumy.summarizers.reduction import ReductionSummarizer |
|
from sumy.utils import get_stop_words |
|
import numpy as np |
|
import nltk |
|
# Fetch the "punkt" sentence-tokenizer models required by sent_tokenize.
# quiet=True keeps the downloader from printing status output on every import.
nltk.download("punkt", quiet=True)
|
def summary_with_tfidf(text, num_summary_sentence=3):
    """Extractive summary: keep the sentences with the highest total TF-IDF weight.

    Args:
        text: Input document as a single string.
        num_summary_sentence: Number of sentences to keep in the summary.

    Returns:
        The selected sentences, in their original document order, joined by
        blank lines. Returns an empty string when the text has no sentences.
    """
    sentences = nltk.tokenize.sent_tokenize(text)
    if not sentences:
        # TfidfVectorizer raises ValueError on an empty corpus; treat as "no summary".
        return ""

    words_tfidf = TfidfVectorizer().fit_transform(sentences)

    # Score each sentence by the sum of its term weights. The sparse-matrix
    # sum yields an (n_sentences, 1) np.matrix; flatten it to a plain 1-D array
    # so argsort and membership tests behave predictably.
    scores = np.asarray(words_tfidf.sum(axis=1)).ravel()

    # Indices of the top-scoring sentences; a set gives O(1) membership tests.
    top_indices = set(np.argsort(scores)[::-1][:num_summary_sentence].tolist())

    # Emit in document order (not score order) for readability.
    return "\n\n".join(s for i, s in enumerate(sentences) if i in top_indices)
|
|
|
|
|
|
|
def summary_with_lsa(text, num_summary_sentence=3, language='arabic'):
    """Extractive summary of *text* using Latent Semantic Analysis (sumy).

    Args:
        text: Input document as a single string.
        num_summary_sentence: Number of sentences to extract.
        language: Language used for tokenization, stemming, and stop words.
            Defaults to Arabic, preserving the original behaviour.

    Returns:
        The extracted sentences joined by blank lines.
    """
    parser = PlaintextParser.from_string(text, Tokenizer(language))

    summarizer = LsaSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)

    # sumy yields Sentence objects; str() recovers the surface text.
    return "\n\n".join(
        str(sentence)
        for sentence in summarizer(parser.document, sentences_count=num_summary_sentence)
    )
|
|
|
def summary_with_text_rank(text, num_summary_sentence=3, language='arabic'):
    """Extractive summary of *text* using the TextRank algorithm (sumy).

    Args:
        text: Input document as a single string.
        num_summary_sentence: Number of sentences to extract.
        language: Language used for tokenization, stemming, and stop words.
            Defaults to Arabic, preserving the original behaviour.

    Returns:
        The extracted sentences joined by blank lines.
    """
    parser = PlaintextParser.from_string(text, Tokenizer(language))

    summarizer = TextRankSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)

    # sumy yields Sentence objects; str() recovers the surface text.
    return "\n\n".join(
        str(sentence)
        for sentence in summarizer(parser.document, sentences_count=num_summary_sentence)
    )
|
|
|
def summary_with_text_reduction(text, num_summary_sentence=3, language='arabic'):
    """Extractive summary of *text* using sumy's Reduction summarizer.

    Args:
        text: Input document as a single string.
        num_summary_sentence: Number of sentences to extract.
        language: Language used for tokenization, stemming, and stop words.
            Defaults to Arabic, preserving the original behaviour.

    Returns:
        The extracted sentences joined by blank lines.
    """
    parser = PlaintextParser.from_string(text, Tokenizer(language))

    summarizer = ReductionSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)

    # sumy yields Sentence objects; str() recovers the surface text.
    return "\n\n".join(
        str(sentence)
        for sentence in summarizer(parser.document, sentences_count=num_summary_sentence)
    )
|
|