# ArticlePara / summarizer.py
# Header captured from the Hugging Face file viewer:
#   author: olusegun.odewole, commit: "first commit" (8749106), size: 6.23 kB, no virus detected
import nltk
# NOTE: these downloads are module-level statements, so they run every time
# this module is imported.
nltk.download('punkt')  # presumably the tokenizer data used by sent_tokenize / word_tokenize - confirm
nltk.download('stopwords')  # corpus read through nltk.corpus.stopwords below
import math
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords
def _create_frequency_table(text_string) -> dict:
    """
    Build a stem -> occurrence-count table for a whole text.

    The text is split with ``word_tokenize``. Each token is lower-cased and
    reduced to its stem with ``PorterStemmer``; tokens that are English stop
    words are skipped and the remaining stems are counted.

    :param text_string: plain text to analyse
    :return: mapping of stemmed word to the number of times it occurs
    :rtype: dict
    """
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    freqTable = {}
    for word in word_tokenize(text_string):
        # Lower-case first so this function agrees with
        # _create_frequency_matrix, which does the same.
        lowered = word.lower()
        stem = ps.stem(lowered)
        # The stop-word list holds unstemmed words, so the surface form has to
        # be tested: stems such as "thi" (this) or "wa" (was) never match it.
        # The stem is still tested as well so that everything the previous
        # stem-only check filtered out keeps being filtered out.
        if lowered in stopWords or stem in stopWords:
            continue
        freqTable[stem] = freqTable.get(stem, 0) + 1
    return freqTable
def _create_frequency_matrix(sentences):
    """
    Build one stem -> count table per sentence.

    Every sentence is tokenised with ``word_tokenize``; tokens are lower-cased
    and stemmed with ``PorterStemmer``, English stop words are skipped, and
    the remaining stems are counted.

    :param sentences: iterable of sentence strings
    :return: dict mapping the first 15 characters of each sentence to that
        sentence's ``{stem: count}`` table
    """
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    frequency_matrix = {}
    for sent in sentences:
        freq_table = {}
        for word in word_tokenize(sent):
            lowered = word.lower()
            stem = ps.stem(lowered)
            # The stop-word list holds unstemmed words, so the surface form
            # has to be tested: stems such as "thi" (this) or "wa" (was)
            # never match it. The stem is tested too so that nothing the
            # previous stem-only check removed is let back in.
            if lowered in stopWords or stem in stopWords:
                continue
            freq_table[stem] = freq_table.get(stem, 0) + 1

        # NOTE: the key is only the first 15 characters of the sentence, the
        # same key _generate_summary looks sentences up by. Sentences sharing
        # a 15-character prefix therefore share one entry, and the later
        # sentence's table overwrites the earlier one.
        frequency_matrix[sent[:15]] = freq_table
    return frequency_matrix
def _create_tf_matrix(freq_matrix):
tf_matrix = {}
for sent, f_table in freq_matrix.items():
tf_table = {}
count_words_in_sentence = len(f_table)
for word, count in f_table.items():
tf_table[word] = count / count_words_in_sentence
tf_matrix[sent] = tf_table
return tf_matrix
def _create_documents_per_words(freq_matrix):
word_per_doc_table = {}
for sent, f_table in freq_matrix.items():
for word, count in f_table.items():
if word in word_per_doc_table:
word_per_doc_table[word] += 1
else:
word_per_doc_table[word] = 1
return word_per_doc_table
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
idf_matrix = {}
for sent, f_table in freq_matrix.items():
idf_table = {}
for word in f_table.keys():
idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))
idf_matrix[sent] = idf_table
return idf_matrix
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
tf_idf_matrix = {}
for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):
tf_idf_table = {}
for (word1, value1), (word2, value2) in zip(f_table1.items(),
f_table2.items()): # here, keys are the same in both the table
tf_idf_table[word1] = float(value1 * value2)
tf_idf_matrix[sent1] = tf_idf_table
return tf_idf_matrix
def _score_sentences(tf_idf_matrix) -> dict:
"""
score a sentence by its word's TF
Basic algorithm: adding the TF frequency of every non-stop word in a sentence divided by total no of words in a sentence.
:rtype: dict
"""
sentenceValue = {}
for sent, f_table in tf_idf_matrix.items():
total_score_per_sentence = 0
count_words_in_sentence = len(f_table)
for word, score in f_table.items():
total_score_per_sentence += score
sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence
return sentenceValue
def _find_average_score(sentenceValue) -> int:
"""
Find the average score from the sentence value dictionary
:rtype: int
"""
sumValues = 0
for entry in sentenceValue:
sumValues += sentenceValue[entry]
# Average value of a sentence from original summary_text
average = (sumValues / len(sentenceValue))
return average
def _generate_summary(sentences, sentenceValue, threshold):
sentence_count = 0
summary = ''
for sentence in sentences:
if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= (threshold):
summary += " " + sentence
sentence_count += 1
return summary
def run_summarization(text, threshold_factor=1.3):
    """
    Produce an extractive summary of ``text`` using TF-IDF sentence scores.

    The text is split into sentences, each sentence is scored by the mean
    TF-IDF value of its words, and the sentences whose score is at least
    ``threshold_factor`` times the average sentence score are returned in
    their original order.

    :param text: plain text of a long article
    :param threshold_factor: multiplier applied to the average sentence score
        to obtain the selection threshold; defaults to 1.3
    :return: summarized text; an empty string when ``text`` has no sentences
    """
    # 1 Sentence tokenize: each sentence is treated as one "document".
    sentences = sent_tokenize(text)
    if not sentences:
        # Nothing to score; the averaging step would otherwise divide by zero.
        return ''
    total_documents = len(sentences)

    # 2 Create the frequency matrix of the words in each sentence.
    freq_matrix = _create_frequency_matrix(sentences)

    # 3 Term frequency (TF): how often a word appears in a sentence.
    tf_matrix = _create_tf_matrix(freq_matrix)

    # 4 Count how many sentences contain each word.
    count_doc_per_words = _create_documents_per_words(freq_matrix)

    # 5 Inverse document frequency (IDF): how unique or rare a word is.
    idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)

    # 6 Combine TF and IDF into a TF-IDF matrix.
    tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)

    # 7 Score the sentences.
    sentence_scores = _score_sentences(tf_idf_matrix)

    # 8 The average score is the base of the selection threshold.
    threshold = _find_average_score(sentence_scores)

    # 9 Keep the sentences that score at least threshold_factor * average.
    summary = _generate_summary(sentences, sentence_scores, threshold_factor * threshold)
    return summary
#usage = run_summarization(text_str)
# def text_summarize(ARTICLE, maxLength, minLength):
# output = summarizer(ARTICLE)[0]['summary_text']
# ans = text_paraphrase(output)
# return ans