import math

import nltk
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')


def _create_frequency_table(text_string) -> dict:
    """
    Create a frequency table for the words in the text, using only the
    words that are not in the stop-word list.

    Stemmer: an algorithm that reduces a word to its root form.
    :rtype: dict
    """
    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    ps = PorterStemmer()

    freqTable = dict()
    for word in words:
        word = word.lower()
        # Check the stop-word list before stemming: stemmed forms
        # (e.g. "wa" from "was") would no longer match it.
        if word in stopWords:
            continue
        word = ps.stem(word)
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    return freqTable


def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            if word in stopWords:
                continue
            word = ps.stem(word)
            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        # The first 15 characters of the sentence serve as its key.
        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix


def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        # Divide by the total number of words in the sentence,
        # not the number of distinct words.
        count_words_in_sentence = sum(f_table.values())
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix


def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table


def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix


def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    # Both matrices were built from the same freq_matrix, so they share
    # the same sentence and word keys; look the IDF values up by key
    # rather than zipping items and relying on dictionary order.
    for sent, tf_table in tf_matrix.items():
        tf_idf_table = {}
        idf_table = idf_matrix[sent]

        for word, tf_value in tf_table.items():
            tf_idf_table[word] = float(tf_value * idf_table[word])

        tf_idf_matrix[sent] = tf_idf_table

    return tf_idf_matrix


def _score_sentences(tf_idf_matrix) -> dict:
    """
    Score a sentence by the TF-IDF values of its words.

    Basic algorithm: add the TF-IDF score of every non-stop word in a
    sentence, divided by the number of non-stop words in the sentence.
    :rtype: dict
    """
    sentenceValue = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0
        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        # Skip sentences whose words were all stop words, to avoid
        # dividing by zero.
        if count_words_in_sentence > 0:
            sentenceValue[sent] = total_score_per_sentence / count_words_in_sentence

    return sentenceValue


def _find_average_score(sentenceValue) -> float:
    """
    Find the average score from the sentence value dictionary.
    :rtype: float
    """
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average score of a sentence from the original text
    average = sumValues / len(sentenceValue)

    return average


def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentenceValue and sentenceValue[sentence[:15]] >= threshold:
            summary += " " + sentence
            sentence_count += 1

    return summary


def run_summarization(text):
    """
    :param text: plain text of a long article
    :return: summarized text
    """

    # 1 Sentence tokenize. NLTK already provides a sentence tokenizer,
    # so we only need to call sent_tokenize() to get the list of sentences.
    sentences = sent_tokenize(text)
    total_documents = len(sentences)
    #print(sentences)

    # 2 Create the frequency matrix of the words in each sentence.
    freq_matrix = _create_frequency_matrix(sentences)
    #print(freq_matrix)

    # 3 Calculate term frequency (TF) and generate a matrix.
    # TF is how often a word appears in a document, divided by how many
    # words there are in that document.
    tf_matrix = _create_tf_matrix(freq_matrix)
    #print(tf_matrix)

    # 4 Create a table of how many documents (sentences) contain each word.
    count_doc_per_words = _create_documents_per_words(freq_matrix)
    #print(count_doc_per_words)

    # 5 Calculate inverse document frequency (IDF) and generate a matrix.
    # IDF measures how unique or rare a word is across the documents.
    idf_matrix = _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents)
    #print(idf_matrix)

    # 6 Calculate TF-IDF and generate a matrix.
    tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
    #print(tf_idf_matrix)

    # 7 Important algorithm: score the sentences.
    sentence_scores = _score_sentences(tf_idf_matrix)
    #print(sentence_scores)

    # 8 Find the threshold.
    threshold = _find_average_score(sentence_scores)
    #print(threshold)

    # 9 Important algorithm: generate the summary.
    summary = _generate_summary(sentences, sentence_scores, 1.3 * threshold)

    return summary


#usage = run_summarization(text_str)

# def text_summarize(ARTICLE, maxLength, minLength):
#     output = summarizer(ARTICLE)[0]['summary_text']
#     ans = text_paraphrase(output)
#     return ans
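
# A minimal usage sketch of the pipeline end to end. The sample article
# below is invented for illustration; substitute any plain-text article.
# With only a handful of sentences, the 1.3 * average threshold used in
# run_summarization keeps just the highest-scoring ones.
if __name__ == '__main__':
    sample_text = (
        "Mars is the fourth planet from the Sun. "
        "The surface of Mars is covered in iron oxide dust, which gives "
        "the planet its reddish appearance. "
        "Several robotic rovers have explored the Martian surface. "
        "Scientists study Mars because its geology preserves a record of "
        "the early solar system. "
        "Future missions may eventually carry humans to Mars."
    )
    print(run_summarization(sample_text))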