import pickle
import re

import gradio as gr
import networkx as nx
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity

nltk.download("punkt")
nltk.download("stopwords")

# Dimensionality of the stored word vectors (matches the pickled embeddings)
EMBEDDING_DIM = 25

# Load the pre-computed word embeddings (a dict mapping word -> 25-d vector)
with open("cleaned_word_embeddings.pkl", "rb") as f:
    cleaned_word_embeddings = pickle.load(f)


def get_clean_sentences(text):
    """Split text into sentences and strip punctuation and special characters."""
    sentences = sent_tokenize(text)
    # Keep word characters, apostrophes, and spaces; replace everything else
    cleaned_sentences = []
    for sentence in sentences:
        cleaned_sentence = re.sub(r"[^'\w ]", " ", sentence)
        cleaned_sentences.append(cleaned_sentence)
    return cleaned_sentences


def filter_sentences(text):
    """Remove English stopwords from each cleaned sentence."""
    cleaned_sentences = get_clean_sentences(text)
    stop_words = set(stopwords.words("english"))
    filtered_sentences = []
    for sentence in cleaned_sentences:
        words = nltk.word_tokenize(sentence)
        filtered_sentence = " ".join(
            word for word in words if word.lower() not in stop_words
        )
        filtered_sentences.append(filtered_sentence)
    return filtered_sentences


def get_vector_representation(text):
    """Represent each sentence as the mean of its words' embedding vectors."""
    filtered_sentences = filter_sentences(text)
    sentence_vectors = []
    for sentence in filtered_sentences:
        words = sentence.split()
        sentence_vector = np.zeros((EMBEDDING_DIM,))
        if words:
            for word in words:
                if word in cleaned_word_embeddings:
                    sentence_vector += cleaned_word_embeddings[word]
            sentence_vector /= len(words)
        sentence_vectors.append(sentence_vector)
    return sentence_vectors


def calculate_cosine_similarity(sentence_vectors):
    """Build the pairwise sentence-similarity matrix."""
    return cosine_similarity(np.vstack(sentence_vectors))


def get_scores(similarity_matrix):
    """Run PageRank over the similarity graph to score each sentence."""
    nx_graph = nx.from_numpy_array(similarity_matrix)
    return nx.pagerank(nx_graph)


def rank_sentences(text):
    """Return (score, sentence) pairs sorted by PageRank score, highest first."""
    sentence_vectors = get_vector_representation(text)
    similarity_matrix = calculate_cosine_similarity(sentence_vectors)
    scores = get_scores(similarity_matrix)
    ranked_sentences = sorted(
        ((scores[j], sentence) for j, sentence in enumerate(sent_tokenize(text))),
        reverse=True,
    )
    return ranked_sentences


def summarize(text):
    """Keep the top ~10% of sentences (at least one) as the summary."""
    ranked_sentences = rank_sentences(text)
    n_sentences = max(1, len(ranked_sentences) // 10)
    summary = ""
    for j in range(n_sentences):
        summary += ranked_sentences[j][1] + " "
    return summary.strip()


demo = gr.Interface(fn=summarize, inputs="text", outputs="text")
demo.launch(share=True)
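# A minimal sanity check, assuming cleaned_word_embeddings.pkl is present in the
# working directory. The sample text below is hypothetical; run these lines from
# a REPL (or before demo.launch(), which blocks) to inspect a summary without
# the Gradio UI:
#
#   sample = (
#       "The solar car completed its first cross-country trip this week. "
#       "Engineers said the vehicle averaged a steady speed on public roads. "
#       "The team now plans a longer route through mountainous terrain. "
#       "Sponsors praised the project as a milestone for student engineering."
#   )
#   print(summarize(sample))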