Spaces:
Runtime error
Runtime error
File size: 6,229 Bytes
8749106 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 |
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import math
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords
def _create_frequency_table(text_string) -> dict:
"""
we create a dictionary for the word frequency table.
For this, we should only use the words that are not part of the stopWords array.
Removing stop words and making frequency table
Stemmer - an algorithm to bring words to its root word.
:rtype: dict
"""
stopWords = set(stopwords.words("english"))
words = word_tokenize(text_string)
ps = PorterStemmer()
freqTable = dict()
for word in words:
word = ps.stem(word)
if word in stopWords:
continue
if word in freqTable:
freqTable[word] += 1
else:
freqTable[word] = 1
return freqTable
def _create_frequency_matrix(sentences):
frequency_matrix = {}
stopWords = set(stopwords.words("english"))
ps = PorterStemmer()
for sent in sentences:
freq_table = {}
words = word_tokenize(sent)
for word in words:
word = word.lower()
word = ps.stem(word)
if word in stopWords:
continue
if word in freq_table:
freq_table[word] += 1
else:
freq_table[word] = 1
frequency_matrix[sent[:15]] = freq_table
return frequency_matrix
def _create_tf_matrix(freq_matrix):
tf_matrix = {}
for sent, f_table in freq_matrix.items():
tf_table = {}
count_words_in_sentence = len(f_table)
for word, count in f_table.items():
tf_table[word] = count / count_words_in_sentence
tf_matrix[sent] = tf_table
return tf_matrix
def _create_documents_per_words(freq_matrix):
word_per_doc_table = {}
for sent, f_table in freq_matrix.items():
for word, count in f_table.items():
if word in word_per_doc_table:
word_per_doc_table[word] += 1
else:
word_per_doc_table[word] = 1
return word_per_doc_table
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
idf_matrix = {}
for sent, f_table in freq_matrix.items():
idf_table = {}
for word in f_table.keys():
idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))
idf_matrix[sent] = idf_table
return idf_matrix
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
tf_idf_matrix = {}
for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):
tf_idf_table = {}
for (word1, value1), (word2, value2) in zip(f_table1.items(),
f_table2.items()): # here, keys are the same in both the table
tf_idf_table[word1] = float(value1 * value2)
tf_idf_matrix[sent1] = tf_idf_table
return tf_idf_matrix
def _score_sentences(tf_idf_matrix) -> dict:
"""
score a sentence by its word's TF
Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
:rtype: dict
"""
sentenceValue = {}
for sent, f_table in tf_idf_matrix.items():
total_score_per_sentence = 0
count_words_in_sentence = len(f_table)
for word, score in f_table.items():
total_score_per_sentence += score
sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence
return sentenceValue
def _find_average_score(sentenceValue) -> int:
"""
Find the average score from the sentence value dictionary
:rtype: int
"""
sumValues = 0
for entry in sentenceValue:
sumValues += sentenceValue[entry]
# Average value of a sentence from original summary_text
average = (sumValues / len(sentenceValue))
return average
def _generate_summary(sentences, sentenceValue, threshold):
sentence_count = 0
summary = ''
for sentence in sentences:
if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
summary += " " + sentence
sentence_count += 1
return summary
def run_summarization(text):
"""
:param text: Plain summary_text of long article
:return: summarized summary_text
"""
'''
We already have a sentence tokenizer, so we just need
to run the sent_tokenize() method to create the array of sentences.
'''
# 1 Sentence Tokenize
sentences = sent_tokenize(text)
total_documents = len(sentences)
#print(sentences)
# 2 Create the Frequency matrix of the words in each sentence.
freq_matrix = _create_frequency_matrix(sentences)
#print(freq_matrix)
'''
Term frequency (TF) is how often a word appears in a document, divided by how many words are there in a document.
'''
# 3 Calculate TermFrequency and generate a matrix
tf_matrix = _create_tf_matrix(freq_matrix)
#print(tf_matrix)
# 4 creating table for documents per words
count_doc_per_words = _create_documents_per_words(freq_matrix)
#print(count_doc_per_words)
'''
Inverse document frequency (IDF) is how unique or rare a word is.
'''
# 5 Calculate IDF and generate a matrix
idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
#print(idf_matrix)
# 6 Calculate TF-IDF and generate a matrix
tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
#print(tf_idf_matrix)
# 7 Important Algorithm: score the sentences
sentence_scores = _score_sentences(tf_idf_matrix)
#print(sentence_scores)
# 8 Find the threshold
threshold = _find_average_score(sentence_scores)
#print(threshold)
# 9 Important Algorithm: Generate the summary
summary = _generate_summary(sentences, sentence_scores, 1.3 * threshold)
return summary
#usage = run_summarization(text_str)
# def text_summarize(ARTICLE, maxLength, minLength):
# output = summarizer(ARTICLE)[0]['summary_text']
# ans = text_paraphrase(output)
# return ans
|