Mohamed-Sami committed on
Commit
7647552
1 Parent(s): b998a05

Create summarization_methods.py

Browse files
Files changed (1) hide show
  1. summarization_methods.py +74 -0
summarization_methods.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sklearn.feature_extraction.text import TfidfVectorizer
2
+ from sumy.parsers.plaintext import PlaintextParser
3
+ from sumy.nlp.tokenizers import Tokenizer
4
+ from sumy.nlp.stemmers import Stemmer
5
+ from sumy.summarizers.lsa import LsaSummarizer
6
+ from sumy.summarizers.text_rank import TextRankSummarizer
7
+ from sumy.summarizers.reduction import ReductionSummarizer
8
+ from sumy.utils import get_stop_words
9
+ import numpy as np
10
+ import nltk
11
# Module-level side effect: asks NLTK to download the "punkt" resource when
# this module is imported.
# NOTE(review): presumably needed by nltk.tokenize.sent_tokenize, which
# summary_with_tfidf calls; newer NLTK releases may expect "punkt_tab"
# instead — confirm against the installed NLTK version.
+ nltk.download("punkt")
12
def summary_with_tfidf(text, num_summary_sentence=3):
    """Build an extractive summary by ranking sentences on summed TF-IDF weight.

    The text is split into sentences with ``nltk.tokenize.sent_tokenize``,
    every sentence is vectorised as one TF-IDF document, and a sentence's
    score is the sum of its term weights.  The highest-scoring sentences are
    returned in their original order of appearance.

    Args:
        text: Raw text to summarise.
        num_summary_sentence: How many top-ranked sentences to keep (applied
            as a slice bound on the ranking).

    Returns:
        The selected sentences joined by blank lines, or ``""`` when ``text``
        is empty, whitespace-only, or yields no sentences.
    """
    # TfidfVectorizer cannot be fitted on zero documents, so bail out early
    # with an empty summary instead of letting the vectorizer raise.
    if not text or not text.strip():
        return ""

    sentences = nltk.tokenize.sent_tokenize(text)
    if not sentences:
        return ""

    tfidf_matrix = TfidfVectorizer().fit_transform(sentences)

    # sum(axis=1) yields an (n, 1) matrix; flatten it so that ranking and
    # slicing operate on a plain 1-D array of sentence indices.
    sentence_scores = np.asarray(tfidf_matrix.sum(axis=1)).ravel()
    ranked = np.argsort(sentence_scores)[::-1]

    # A set gives O(1) membership checks instead of scanning the ranking
    # array once per sentence.
    chosen = set(ranked[:num_summary_sentence].tolist())

    return "\n\n".join(
        sentence for index, sentence in enumerate(sentences) if index in chosen
    )
25
+
26
+
27
+
28
def summary_with_lsa(text, num_summary_sentence=3, *, language="arabic"):
    """Summarise ``text`` extractively with sumy's ``LsaSummarizer``.

    Args:
        text: Raw text to summarise.
        num_summary_sentence: Number of sentences requested from the
            summarizer.
        language: Language name passed to sumy's ``Stemmer``, ``Tokenizer``
            and ``get_stop_words``.  Defaults to ``"arabic"``, the value that
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        The sentences chosen by the summarizer joined by blank lines;
        ``""`` for empty or whitespace-only input.
    """
    # Nothing to summarise: skip building the tokenizer and stemmer.
    if not text or not text.strip():
        return ""

    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = LsaSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)

    picked = summarizer(parser.document, sentences_count=num_summary_sentence)
    return "\n\n".join(str(sentence) for sentence in picked)
44
+
45
def summary_with_text_rank(text, num_summary_sentence=3, *, language="arabic"):
    """Summarise ``text`` extractively with sumy's ``TextRankSummarizer``.

    Args:
        text: Raw text to summarise.
        num_summary_sentence: Number of sentences requested from the
            summarizer.
        language: Language name passed to sumy's ``Stemmer``, ``Tokenizer``
            and ``get_stop_words``.  Defaults to ``"arabic"``, the value that
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        The sentences chosen by the summarizer joined by blank lines;
        ``""`` for empty or whitespace-only input.
    """
    # Nothing to summarise: skip building the tokenizer and stemmer.
    if not text or not text.strip():
        return ""

    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = TextRankSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)

    picked = summarizer(parser.document, sentences_count=num_summary_sentence)
    return "\n\n".join(str(sentence) for sentence in picked)
59
+
60
def summary_with_text_reduction(text, num_summary_sentence=3, *, language="arabic"):
    """Summarise ``text`` extractively with sumy's ``ReductionSummarizer``.

    Args:
        text: Raw text to summarise.
        num_summary_sentence: Number of sentences requested from the
            summarizer.
        language: Language name passed to sumy's ``Stemmer``, ``Tokenizer``
            and ``get_stop_words``.  Defaults to ``"arabic"``, the value that
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        The sentences chosen by the summarizer joined by blank lines;
        ``""`` for empty or whitespace-only input.
    """
    # Nothing to summarise: skip building the tokenizer and stemmer.
    if not text or not text.strip():
        return ""

    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer = ReductionSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)

    picked = summarizer(parser.document, sentences_count=num_summary_sentence)
    return "\n\n".join(str(sentence) for sentence in picked)
74
+