Spaces:

ksvmuralidhar
/

news_aggregator

Running

App Files Files Community

news_aggregator / word_cloud.py

ksvmuralidhar

Upload files

078c1e1 verified 3 months ago

raw history blame

No virus

19.4 kB

	import numpy as np
	import pandas as pd
	import string
	from unidecode import unidecode
	from collections import Counter


	class TextPreprocessor:
	    """
	    Cleans a pandas Series of raw text so that it can be tokenised on whitespace.

	    The pipeline lower-cases the text, transliterates it to ASCII, strips URLs and
	    expands English contractions. Depending on the constructor flags it then removes
	    punctuation, digits, stop words, short words and the most/least frequent words.

	    Public attributes that may be edited after construction:
	    ``stop_words`` (list of str), ``contraction_to_expansion`` (dict) and
	    ``words_to_remove`` (list of str learnt from the training set).
	    """

	    # Defaults are kept at class level and copied per instance, so editing
	    # ``instance.stop_words`` never leaks into other instances.
	    _DEFAULT_STOP_WORDS = (
	        "'d", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r",
	        "s", "t", "u", "v", "w", "x", "y", "z",
	        'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'ain', 'all', 'almost',
	        'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amount',
	        'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are',
	        'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming',
	        'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between',
	        'beyond', 'both', 'bottom', 'but', 'by', 'ca', 'call', 'can', 'cannot', 'could', 'couldn',
	        "couldn't", 'd', 'did', 'do', 'does', 'doing', 'done', 'down', 'due', 'during', 'each',
	        'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'even', 'ever', 'every',
	        'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'first', 'five',
	        'for', 'former', 'formerly', 'forty', 'four', 'from', 'front', 'full', 'further', 'get',
	        'give', 'go', 'had', 'has', 'have', 'having', 'he', 'hence', 'her', 'here', 'hereafter',
	        'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'however',
	        'hundred', 'i', 'if', 'in', 'indeed', 'into', 'is', 'it', "it's", 'its', 'itself', 'just',
	        'keep', 'last', 'latter', 'latterly', 'least', 'less', 'll', 'm', 'ma', 'made', 'make', 'many',
	        'say', 'said', 'says', 'told', 'tell', 'may', 'me', 'meanwhile', 'might', 'mine', 'more',
	        'moreover', 'most', 'mostly', 'move', 'much', 'must', 'my', 'myself', 'name', 'namely',
	        'neither', 'never', 'nevertheless', 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor',
	        'not', 'nothing', 'now', 'nowhere', 'o', 'of', 'off', 'often', 'on', 'once', 'one', 'only',
	        'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
	        'part', 'per', 'perhaps', 'please', 'put', 'quite', 'rather', 're', 'rs', 'really',
	        'regarding', 's', 'same', 'say', 'see', 'seem', 'seemed', 'seeming', 'seems', 'serious',
	        'several', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shouldn't",
	        'show', 'side', 'since', 'six', 'sixty', 'so', 'some', 'somehow', 'someone', 'something',
	        'sometime', 'sometimes', 'somewhere', 'still', 'such', 't', 'take', 'ten', 'than', 'that',
	        "that'll", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there',
	        'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they', 'third', 'this',
	        'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too',
	        'top', 'toward', 'towards', 'twelve', 'twenty', 'two', 'under', 'unless', 'until', 'up',
	        'upon', 'us', 'used', 'using', 'various', 've', 'very', 'via', 'was', 'wasn', "wasn't", 'we',
	        'well', 'were', 'weren', "weren't", 'what', 'whatever', 'when', 'whence', 'whenever', 'where',
	        'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which',
	        'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with',
	        'within', 'without', 'won', "won't", 'would', 'wouldn', "wouldn't", 'y', 'yet', 'you',
	        "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves',
	        '‘d', '‘ll', '‘m', '‘re', '‘s', '‘ve', '’d', '’ll', '’m', '’re', 'new', 'old', '’s', '’ve',
	    )

	    _DEFAULT_CONTRACTIONS = {
	        "ain't": "am not",
	        "aren't": "are not",
	        "can't": "cannot",
	        "can't've": "cannot have",
	        "'cause": "because",
	        "could've": "could have",
	        "couldn't": "could not",
	        "couldn't've": "could not have",
	        "didn't": "did not",
	        "doesn't": "does not",
	        "don't": "do not",
	        "hadn't": "had not",
	        "hadn't've": "had not have",
	        "hasn't": "has not",
	        "haven't": "have not",
	        "he'd": "he would",
	        "he'd've": "he would have",
	        "he'll": "he will",
	        "he'll've": "he will have",
	        "he's": "he is",
	        "how'd": "how did",
	        "how'd'y": "how do you",
	        "how'll": "how will",
	        "how's": "how is",
	        "i'd": "i would",
	        "i'd've": "i would have",
	        "i'll": "i will",
	        "i'll've": "i will have",
	        "i'm": "i am",
	        "i've": "i have",
	        "isn't": "is not",
	        "it'd": "it had",
	        "it'd've": "it would have",
	        "it'll": "it will",
	        "it'll've": "it will have",
	        "it's": "it is",
	        "let's": "let us",
	        "ma'am": "madam",
	        "mayn't": "may not",
	        "might've": "might have",
	        "mightn't": "might not",
	        "mightn't've": "might not have",
	        "must've": "must have",
	        "mustn't": "must not",
	        "mustn't've": "must not have",
	        "needn't": "need not",
	        "needn't've": "need not have",
	        "o'clock": "of the clock",
	        "oughtn't": "ought not",
	        "oughtn't've": "ought not have",
	        "shan't": "shall not",
	        "sha'n't": "shall not",
	        "shan't've": "shall not have",
	        "she'd": "she would",
	        "she'd've": "she would have",
	        "she'll": "she will",
	        "she'll've": "she will have",
	        "she's": "she is",
	        "should've": "should have",
	        "shouldn't": "should not",
	        "shouldn't've": "should not have",
	        "so've": "so have",
	        "so's": "so is",
	        "that'd": "that would",
	        "that'd've": "that would have",
	        "that's": "that is",
	        "there'd": "there had",
	        "there'd've": "there would have",
	        "there's": "there is",
	        "they'd": "they would",
	        "they'd've": "they would have",
	        "they'll": "they will",
	        "they'll've": "they will have",
	        "they're": "they are",
	        "they've": "they have",
	        "to've": "to have",
	        "wasn't": "was not",
	        "we'd": "we had",
	        "we'd've": "we would have",
	        "we'll": "we will",
	        "we'll've": "we will have",
	        "we're": "we are",
	        "we've": "we have",
	        "weren't": "were not",
	        "what'll": "what will",
	        "what'll've": "what will have",
	        "what're": "what are",
	        "what's": "what is",
	        "what've": "what have",
	        "when's": "when is",
	        "when've": "when have",
	        "where'd": "where did",
	        "where's": "where is",
	        "where've": "where have",
	        "who'll": "who will",
	        "who'll've": "who will have",
	        "who's": "who is",
	        "who've": "who have",
	        "why's": "why is",
	        "why've": "why have",
	        "will've": "will have",
	        "won't": "will not",
	        "won't've": "will not have",
	        "would've": "would have",
	        "wouldn't": "would not",
	        "wouldn't've": "would not have",
	        "y'all": "you all",
	        "y'alls": "you alls",
	        "y'all'd": "you all would",
	        "y'all'd've": "you all would have",
	        "y'all're": "you all are",
	        "y'all've": "you all have",
	        "you'd": "you had",
	        "you'd've": "you would have",
	        # Fixed: these two expansions previously had a duplicated "you".
	        "you'll": "you will",
	        "you'll've": "you will have",
	        "you're": "you are",
	        "you've": "you have",
	    }

	    # Four alternatives joined with "|": http(s) URLs and bare "www." hosts, each in a
	    # long-host and a short-host form.
	    _URL_PATTERN = (
	        r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}"
	        r"|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}"
	        r"|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}"
	        r"|www\.[a-zA-Z0-9]+\.[^\s]{2,})"
	    )

	    # Text substituted for messages that are empty after cleaning.
	    _EMPTY_PLACEHOLDER = "this is an empty message"

	    def __init__(self, remove_punct: bool = True, remove_digits: bool = True,
	                 remove_stop_words: bool = True,
	                 remove_short_words: bool = False, minlen: int = 1, maxlen: int = 1,
	                 top_p: float = None, bottom_p: float = None):
	        """
	        :param remove_punct: bool, replace punctuation with spaces.
	        :param remove_digits: bool, replace digits with spaces.
	        :param remove_stop_words: bool, drop tokens listed in ``self.stop_words``.
	        :param remove_short_words: bool, drop tokens with minlen <= len <= maxlen.
	        :param minlen: int, minimum length of token to be removed.
	        :param maxlen: int, maximum length of token to be removed.
	        :param top_p: float, fraction of the most frequent distinct words to remove.
	        :param bottom_p: float, fraction of the rarest distinct words to remove.
	        """
	        self.remove_punct = remove_punct
	        self.remove_digits = remove_digits
	        self.remove_stop_words = remove_stop_words
	        self.remove_short_words = remove_short_words
	        self.minlen = minlen
	        self.maxlen = maxlen
	        self.top_p = top_p
	        self.bottom_p = bottom_p
	        # Filled by preprocess(..., dataset="train"); reused for any other dataset.
	        self.words_to_remove = []
	        self.stop_words = list(self._DEFAULT_STOP_WORDS)
	        self.contraction_to_expansion = dict(self._DEFAULT_CONTRACTIONS)

	    @staticmethod
	    def __remove_double_whitespaces(text: str) -> str:
	        """
	        Collapses every run of whitespace into one space and trims both ends.
	        :param text: str, input string
	        :return: str, normalised string
	        """
	        return " ".join(text.split())

	    def __remove_url(self, string_series: pd.Series) -> pd.Series:
	        """
	        Removes URLs from text.
	        :param string_series: pd.Series, input string series
	        :return: pd.Series, cleaned string series
	        """
	        without_urls = string_series.str.replace(pat=self._URL_PATTERN, repl=" ", regex=True)
	        return without_urls.map(self.__remove_double_whitespaces)

	    def __expand(self, string_series: pd.Series) -> pd.Series:
	        """
	        Replaces contractions with expansions. eg. don't with do not.
	        :param string_series: pd.Series, input string series
	        :return: pd.Series, cleaned string series
	        """
	        # Longest contractions first, so "can't've" is expanded as a whole instead of
	        # being half-expanded by the shorter "can't" rule. sorted() is stable, so
	        # equal-length keys keep their dictionary order.
	        ordered_rules = sorted(self.contraction_to_expansion.items(),
	                               key=lambda rule: len(rule[0]), reverse=True)
	        expanded = string_series
	        for contraction, expansion in ordered_rules:
	            expanded = expanded.str.replace(pat=contraction, repl=expansion, regex=False)
	        return expanded.map(self.__remove_double_whitespaces)

	    def __remove_punct(self, string_series: pd.Series) -> pd.Series:
	        """
	        Removes punctuations from the input string.
	        :param string_series: pd.Series, input string series
	        :return: pd.Series, cleaned string series
	        """
	        cleaned = string_series
	        # These are the literal two-character sequences backslash+n / r / t (escaped
	        # control characters that survived as text). Real newlines and tabs are
	        # whitespace and are collapsed by the final normalisation step.
	        for escaped in (r'\n', r'\r', r'\t'):
	            cleaned = cleaned.str.replace(pat=escaped, repl=" ", regex=False)
	        # One pass over the text instead of one replace per punctuation character.
	        punct_to_space = str.maketrans({char: " " for char in string.punctuation})
	        cleaned = cleaned.str.translate(punct_to_space)
	        return cleaned.map(self.__remove_double_whitespaces)

	    def __remove_digits(self, string_series: pd.Series) -> pd.Series:
	        """
	        Removes digits from the input string.
	        :param string_series: pd.Series, input string series
	        :return: pd.Series, cleaned string series
	        """
	        without_digits = string_series.str.replace(pat=r'\d', repl=" ", regex=True)
	        return without_digits.map(self.__remove_double_whitespaces)

	    @staticmethod
	    def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1) -> pd.Series:
	        """
	        Removes words/tokens where minlen <= len <= maxlen.
	        :param string_series: pd.Series, input string series
	        :param minlen: int, minimum length of token to be removed.
	        :param maxlen: int, maximum length of token to be removed.
	        :return: pd.Series, cleaned string series
	        """
	        def drop_short(text: str) -> str:
	            return " ".join(word for word in text.split()
	                            if len(word) > maxlen or len(word) < minlen)

	        return string_series.map(drop_short)

	    def __remove_stop_words(self, string_series: pd.Series) -> pd.Series:
	        """
	        Removes stop words from the input string.
	        :param string_series: pd.Series, input string series
	        :return: pd.Series, cleaned string series
	        """
	        # Built per call so edits to self.stop_words are honoured; a set makes each
	        # membership test O(1) instead of scanning the list.
	        stops = frozenset(self.stop_words)

	        def drop_stop_words(text: str) -> str:
	            return " ".join(token for token in text.split() if token not in stops)

	        return string_series.map(drop_stop_words)

	    def __remove_top_bottom_words(self, string_series: pd.Series, top_p: float = None,
	                                  bottom_p: float = None, dataset: str = 'train') -> pd.Series:
	        """
	        Removes top_p percent (frequent) words and bottom_p percent (rare) words.

	        For the training set the words to drop are computed from ``string_series`` and
	        appended to ``self.words_to_remove``; for any other dataset the words already
	        stored there are reused.
	        :param string_series: pd.Series, input string series
	        :param top_p: float, fraction (0-1) of the distinct words to remove, most frequent first.
	        :param bottom_p: float, fraction (0-1) of the distinct words to remove, rarest first.
	        :param dataset: str, "train" for training set, "test" for val/dev/test set.
	        :return: pd.Series, cleaned string series
	        """
	        if dataset == 'train':
	            top_p = 0 if top_p is None else top_p
	            bottom_p = 0 if bottom_p is None else bottom_p

	            if top_p > 0 or bottom_p > 0:
	                # value_counts() sorts by descending frequency: head = frequent, tail = rare.
	                word_freq = pd.Series(" ".join(string_series).split()).value_counts()
	                n_words = len(word_freq)

	                if top_p > 0:
	                    n_top = int(np.ceil(top_p * n_words))
	                    self.words_to_remove.extend(word_freq.index[:n_top].tolist())

	                if bottom_p > 0:
	                    n_bottom = int(np.ceil(bottom_p * n_words))
	                    # Guard: index[-0:] would select every word rather than none.
	                    if n_bottom > 0:
	                        self.words_to_remove.extend(word_freq.index[-n_bottom:].tolist())

	        if not self.words_to_remove:
	            return string_series

	        blocked = set(self.words_to_remove)
	        return string_series.map(
	            lambda text: " ".join(word for word in text.split() if word not in blocked))

	    def preprocess(self, string_series: pd.Series, dataset: str = "train") -> pd.Series:
	        """
	        Entry point.
	        :param string_series: pd.Series, input string series
	        :param dataset: str, "train" for training set, "test" for val/dev/test set.
	        :return: pd.Series, cleaned string series; messages that end up empty are
	                 replaced with "this is an empty message".
	        """
	        # Missing values are treated as empty text instead of crashing the string ops.
	        string_series = string_series.fillna("").astype(str).str.lower()
	        # Transliterate to ASCII (this also turns curly apostrophes into straight ones,
	        # which the contraction table relies on).
	        string_series = string_series.map(unidecode)
	        string_series = self.__remove_url(string_series=string_series)
	        string_series = self.__expand(string_series=string_series)

	        if self.remove_punct:
	            string_series = self.__remove_punct(string_series=string_series)
	        if self.remove_digits:
	            string_series = self.__remove_digits(string_series=string_series)
	        if self.remove_stop_words:
	            string_series = self.__remove_stop_words(string_series=string_series)
	        if self.remove_short_words:
	            string_series = self.__remove_short_words(string_series=string_series,
	                                                      minlen=self.minlen,
	                                                      maxlen=self.maxlen)
	        string_series = self.__remove_top_bottom_words(string_series=string_series,
	                                                       top_p=self.top_p,
	                                                       bottom_p=self.bottom_p, dataset=dataset)

	        string_series = string_series.str.strip()
	        return string_series.replace(to_replace="", value=self._EMPTY_PLACEHOLDER)


	def get_frequent_words_html(df):
	    """
	    Builds the HTML snippet for the word cloud of the 25 most frequent tokens.

	    The title and description of every row are joined, cleaned with a default
	    ``TextPreprocessor`` and counted. Each token becomes an
	    ``<a class="wc-tokens">`` link that calls ``wc_search`` on click. After the
	    fifth token a ``word-cloud-section`` div is opened, which holds the rest.
	    :param df: pd.DataFrame with 'title' and 'description' columns
	    :return: str, HTML markup wrapped in a ``word-cloud-container`` div
	    """
	    # A missing title or description must not turn the whole row into NaN.
	    combined_text = df['title'].fillna('') + ' ' + df['description'].fillna('')
	    preprocessed_txt = TextPreprocessor().preprocess(combined_text)
	    # preprocess() substitutes this placeholder for empty rows; it is not news
	    # content, so keep its words out of the cloud.
	    preprocessed_txt = preprocessed_txt[preprocessed_txt != "this is an empty message"]
	    counter = Counter(' '.join(preprocessed_txt).split())

	    html_parts = ['<div class="word-cloud-container">']
	    section_opened = False
	    for position, (token, _count) in enumerate(counter.most_common(25), start=1):
	        # NOTE(review): consecutive plain spaces collapse when rendered as HTML;
	        # confirm whether a non-breaking space was intended here.
	        gap = " " * np.random.randint(3, 7, 1)[0]
	        html_parts.append(f'<a class="wc-tokens" onclick=wc_search("{token}")>{token}</a>{gap}')
	        if position == 5:
	            html_parts.append('<div class="word-cloud-section" id="word-cloud-section-id">')
	            section_opened = True

	    # Close the inner section only if it was opened (it is not with fewer than
	    # five tokens); otherwise the markup ends with an unmatched </div>.
	    html_parts.append('</div></div>' if section_opened else '</div>')
	    return ''.join(html_parts)