Spaces:
Running
Running
import numpy as np | |
import pandas as pd | |
import string | |
from unidecode import unidecode | |
from collections import Counter | |
class TextPreprocessor:
    """
    Cleans a pandas Series of English text.

    The pipeline (see ``preprocess``) lower-cases the text, transliterates it to
    ASCII, strips URLs, expands contractions and then optionally removes
    punctuation, digits, stop words, short words and the most/least frequent
    words learnt from the training set.
    """

    # Default stop words; copied into ``self.stop_words`` for every instance so
    # callers can customise the list without affecting other instances.
    _DEFAULT_STOP_WORDS = (
        "'d", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
        "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
        "about", "above", "across", "after", "afterwards", "again", "against",
        "ain", "all", "almost", "alone", "along", "already", "also", "although",
        "always", "am", "among", "amongst", "amount", "an", "and", "another",
        "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
        "around", "as", "at", "back", "be", "became", "because", "become",
        "becomes", "becoming", "been", "before", "beforehand", "behind",
        "being", "below", "beside", "besides", "between", "beyond", "both",
        "bottom", "but", "by", "ca", "call", "can", "cannot", "could",
        "couldn", "couldn't", "d", "did", "do", "does", "doing", "done",
        "down", "due", "during", "each", "eight", "either", "eleven", "else",
        "elsewhere", "empty", "enough", "even", "ever", "every", "everyone",
        "everything", "everywhere", "except", "few", "fifteen", "fifty",
        "first", "five", "for", "former", "formerly", "forty", "four", "from",
        "front", "full", "further", "get", "give", "go", "had", "has", "have",
        "having", "he", "hence", "her", "here", "hereafter", "hereby",
        "herein", "hereupon", "hers", "herself", "him", "himself", "his",
        "how", "however", "hundred", "i", "if", "in", "indeed", "into", "is",
        "it", "it's", "its", "itself", "just", "keep", "last", "latter",
        "latterly", "least", "less", "ll", "m", "ma", "made", "make", "many",
        "say", "said", "says", "told", "tell", "may", "me", "meanwhile",
        "might", "mine", "more", "moreover", "most", "mostly", "move", "much",
        "must", "my", "myself", "name", "namely", "neither", "never",
        "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor",
        "not", "nothing", "now", "nowhere", "o", "of", "off", "often", "on",
        "once", "one", "only", "onto", "or", "other", "others", "otherwise",
        "our", "ours", "ourselves", "out", "over", "own", "part", "per",
        "perhaps", "please", "put", "quite", "rather", "re", "rs", "really",
        "regarding", "s", "same", "say", "see", "seem", "seemed", "seeming",
        "seems", "serious", "several", "shan", "shan't", "she", "she's",
        "should", "should've", "shouldn", "shouldn't", "show", "side", "since",
        "six", "sixty", "so", "some", "somehow", "someone", "something",
        "sometime", "sometimes", "somewhere", "still", "such", "t", "take",
        "ten", "than", "that", "that'll", "the", "their", "theirs", "them",
        "themselves", "then", "thence", "there", "thereafter", "thereby",
        "therefore", "therein", "thereupon", "these", "they", "third", "this",
        "those", "though", "three", "through", "throughout", "thru", "thus",
        "to", "together", "too", "top", "toward", "towards", "twelve",
        "twenty", "two", "under", "unless", "until", "up", "upon", "us",
        "used", "using", "various", "ve", "very", "via", "was", "wasn",
        "wasn't", "we", "well", "were", "weren", "weren't", "what", "whatever",
        "when", "whence", "whenever", "where", "whereafter", "whereas",
        "whereby", "wherein", "whereupon", "wherever", "whether", "which",
        "while", "whither", "who", "whoever", "whole", "whom", "whose", "why",
        "will", "with", "within", "without", "won", "won't", "would",
        "wouldn", "wouldn't", "y", "yet", "you", "you'd", "you'll", "you're",
        "you've", "your", "yours", "yourself", "yourselves",
        "‘d", "‘ll", "‘m", "‘re", "‘s", "‘ve",
        "’d", "’ll", "’m", "’re", "new", "old", "’s", "’ve",
    )

    # Default contraction -> expansion table; copied per instance.
    _DEFAULT_CONTRACTIONS = {
        "ain't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "can't've": "cannot have",
        "'cause": "because",
        "could've": "could have",
        "couldn't": "could not",
        "couldn't've": "could not have",
        "didn't": "did not",
        "doesn't": "does not",
        "don't": "do not",
        "hadn't": "had not",
        "hadn't've": "had not have",
        "hasn't": "has not",
        "haven't": "have not",
        "he'd": "he would",
        "he'd've": "he would have",
        "he'll": "he will",
        "he'll've": "he will have",
        "he's": "he is",
        "how'd": "how did",
        "how'd'y": "how do you",
        "how'll": "how will",
        "how's": "how is",
        "i'd": "i would",
        "i'd've": "i would have",
        "i'll": "i will",
        "i'll've": "i will have",
        "i'm": "i am",
        "i've": "i have",
        "isn't": "is not",
        "it'd": "it had",
        "it'd've": "it would have",
        "it'll": "it will",
        "it'll've": "it will have",
        "it's": "it is",
        "let's": "let us",
        "ma'am": "madam",
        "mayn't": "may not",
        "might've": "might have",
        "mightn't": "might not",
        "mightn't've": "might not have",
        "must've": "must have",
        "mustn't": "must not",
        "mustn't've": "must not have",
        "needn't": "need not",
        "needn't've": "need not have",
        "o'clock": "of the clock",
        "oughtn't": "ought not",
        "oughtn't've": "ought not have",
        "shan't": "shall not",
        "sha'n't": "shall not",
        "shan't've": "shall not have",
        "she'd": "she would",
        "she'd've": "she would have",
        "she'll": "she will",
        "she'll've": "she will have",
        "she's": "she is",
        "should've": "should have",
        "shouldn't": "should not",
        "shouldn't've": "should not have",
        "so've": "so have",
        "so's": "so is",
        "that'd": "that would",
        "that'd've": "that would have",
        "that's": "that is",
        "there'd": "there had",
        "there'd've": "there would have",
        "there's": "there is",
        "they'd": "they would",
        "they'd've": "they would have",
        "they'll": "they will",
        "they'll've": "they will have",
        "they're": "they are",
        "they've": "they have",
        "to've": "to have",
        "wasn't": "was not",
        "we'd": "we had",
        "we'd've": "we would have",
        "we'll": "we will",
        "we'll've": "we will have",
        "we're": "we are",
        "we've": "we have",
        "weren't": "were not",
        "what'll": "what will",
        "what'll've": "what will have",
        "what're": "what are",
        "what's": "what is",
        "what've": "what have",
        "when's": "when is",
        "when've": "when have",
        "where'd": "where did",
        "where's": "where is",
        "where've": "where have",
        "who'll": "who will",
        "who'll've": "who will have",
        "who's": "who is",
        "who've": "who have",
        "why's": "why is",
        "why've": "why have",
        "will've": "will have",
        "won't": "will not",
        "won't've": "will not have",
        "would've": "would have",
        "wouldn't": "would not",
        "wouldn't've": "would not have",
        "y'all": "you all",
        "y'alls": "you alls",
        "y'all'd": "you all would",
        "y'all'd've": "you all would have",
        "y'all're": "you all are",
        "y'all've": "you all have",
        "you'd": "you had",
        "you'd've": "you would have",
        # The two entries below previously expanded to "you you will ...".
        "you'll": "you will",
        "you'll've": "you will have",
        "you're": "you are",
        "you've": "you have",
    }

    # Regular expression matching http(s):// and bare www. URLs.
    _URL_PATTERN = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"

    # Translation table turning every ASCII punctuation character into a space.
    _PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))

    def __init__(self, remove_punct: bool = True, remove_digits: bool = True,
                 remove_stop_words: bool = True,
                 remove_short_words: bool = False, minlen: int = 1, maxlen: int = 1,
                 top_p: float = None, bottom_p: float = None):
        """
        Stores the pipeline configuration.
        :param remove_punct: bool, replace punctuation with spaces.
        :param remove_digits: bool, replace digits with spaces.
        :param remove_stop_words: bool, drop tokens found in ``self.stop_words``.
        :param remove_short_words: bool, drop tokens with minlen <= len <= maxlen.
        :param minlen: int, minimum length of token to be removed.
        :param maxlen: int, maximum length of token to be removed.
        :param top_p: float, fraction of most frequent words to remove (None disables).
        :param bottom_p: float, fraction of rarest words to remove (None disables).
        """
        self.remove_punct = remove_punct
        self.remove_digits = remove_digits
        self.remove_stop_words = remove_stop_words
        self.remove_short_words = remove_short_words
        self.minlen = minlen
        self.maxlen = maxlen
        self.top_p = top_p
        self.bottom_p = bottom_p
        # Frequent/rare words learnt by the last "train" pass.
        self.words_to_remove = []
        self.stop_words = list(self._DEFAULT_STOP_WORDS)
        self.contraction_to_expansion = dict(self._DEFAULT_CONTRACTIONS)

    @staticmethod
    def __remove_double_whitespaces(string: str) -> str:
        """
        Collapses every run of whitespace into one space and trims both ends.
        :param string: str, input string
        :return: str, normalised string
        """
        return " ".join(string.split())

    def __remove_url(self, string_series: pd.Series) -> pd.Series:
        """
        Removes URLs from text.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        without_urls = string_series.str.replace(pat=self._URL_PATTERN, repl=" ", regex=True)
        return without_urls.map(self.__remove_double_whitespaces)

    def __expand(self, string_series: pd.Series) -> pd.Series:
        """
        Replaces contractions with expansions, e.g. "don't" with "do not".
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        # Longest contractions first, so "can't've" is expanded as a whole
        # instead of being broken into "cannot've" by the shorter "can't" rule.
        longest_first = sorted(self.contraction_to_expansion.items(),
                               key=lambda pair: len(pair[0]), reverse=True)
        expanded = string_series
        for contraction, expansion in longest_first:
            expanded = expanded.str.replace(pat=contraction, repl=expansion, regex=False)
        return expanded.map(self.__remove_double_whitespaces)

    def __remove_punct(self, string_series: pd.Series) -> pd.Series:
        """
        Removes punctuation from the input strings.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        cleaned = string_series
        # These are the literal two-character sequences backslash+n/r/t (escaped
        # control codes left in the text); they must go before the backslash
        # itself is stripped as punctuation, otherwise a stray n/r/t remains.
        for escaped in (r'\n', r'\r', r'\t'):
            cleaned = cleaned.str.replace(pat=escaped, repl=" ", regex=False)
        # One pass over each string instead of one pass per punctuation mark.
        cleaned = cleaned.str.translate(self._PUNCT_TO_SPACE)
        return cleaned.map(self.__remove_double_whitespaces)

    def __remove_digits(self, string_series: pd.Series) -> pd.Series:
        """
        Removes digits from the input strings.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        without_digits = string_series.str.replace(pat=r'\d', repl=" ", regex=True)
        return without_digits.map(self.__remove_double_whitespaces)

    @staticmethod
    def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1) -> pd.Series:
        """
        Removes words/tokens where minlen <= len <= maxlen.
        :param string_series: pd.Series, input string series
        :param minlen: int, minimum length of token to be removed.
        :param maxlen: int, maximum length of token to be removed.
        :return: pd.Series, cleaned string series
        """
        def drop_short(text: str) -> str:
            return " ".join(word for word in text.split()
                            if len(word) > maxlen or len(word) < minlen)

        return string_series.map(drop_short)

    def __remove_stop_words(self, string_series: pd.Series) -> pd.Series:
        """
        Removes stop words from the input strings.
        :param string_series: pd.Series, input string series
        :return: pd.Series, cleaned string series
        """
        # Built once per call so each token lookup is constant time.
        stops = set(self.stop_words)

        def drop_stop_words(text: str) -> str:
            return " ".join(token for token in text.split() if token not in stops)

        return string_series.map(drop_stop_words)

    def __remove_top_bottom_words(self, string_series: pd.Series, top_p: float = None,
                                  bottom_p: float = None, dataset: str = 'train') -> pd.Series:
        """
        Removes top_p fraction (frequent) words and bottom_p fraction (rare) words.
        The words are learnt when dataset == "train" and stored in
        ``self.words_to_remove``; any other dataset value re-uses the stored words.
        :param string_series: pd.Series, input string series
        :param top_p: float, fraction of frequent words to remove.
        :param bottom_p: float, fraction of rare words to remove.
        :param dataset: str, "train" for training set, "test" for val/dev/test set.
        :return: pd.Series, cleaned string series
        """
        if dataset == 'train':
            top_p = top_p or 0
            bottom_p = bottom_p or 0
            if top_p > 0 or bottom_p > 0:
                word_freq = pd.Series(" ".join(string_series).split()).value_counts()
                n_words = len(word_freq)
                # Start from scratch so that fitting twice does not pile up
                # duplicates or keep words from an earlier training set.
                learnt = []
                if top_p > 0:
                    learnt.extend(word_freq.index[: int(np.ceil(top_p * n_words))])
                if bottom_p > 0:
                    learnt.extend(word_freq.index[-int(np.ceil(bottom_p * n_words)):])
                self.words_to_remove = learnt
        if not self.words_to_remove:
            return string_series
        blocked = set(self.words_to_remove)
        return string_series.map(
            lambda text: " ".join(word for word in text.split() if word not in blocked))

    def preprocess(self, string_series: pd.Series, dataset: str = "train") -> pd.Series:
        """
        Entry point.
        Strings that end up empty after cleaning are replaced with the
        placeholder "this is an empty message".
        :param string_series: pd.Series, input string series
        :param dataset: str, "train" for training set, "test" for val/dev/test set.
        :return: pd.Series, cleaned string series
        """
        cleaned = string_series.str.lower().map(unidecode)
        cleaned = self.__remove_url(string_series=cleaned)
        cleaned = self.__expand(string_series=cleaned)
        if self.remove_punct:
            cleaned = self.__remove_punct(string_series=cleaned)
        if self.remove_digits:
            cleaned = self.__remove_digits(string_series=cleaned)
        if self.remove_stop_words:
            cleaned = self.__remove_stop_words(string_series=cleaned)
        if self.remove_short_words:
            cleaned = self.__remove_short_words(string_series=cleaned,
                                                minlen=self.minlen,
                                                maxlen=self.maxlen)
        cleaned = self.__remove_top_bottom_words(string_series=cleaned,
                                                 top_p=self.top_p,
                                                 bottom_p=self.bottom_p, dataset=dataset)
        cleaned = cleaned.str.strip()
        return cleaned.replace(to_replace="", value="this is an empty message")
def get_frequent_words_html(df):
    """
    Builds the word-cloud HTML for the 25 most frequent tokens in ``df``.

    The 'title' and 'description' columns are joined, cleaned with a default
    ``TextPreprocessor`` and tokenised on whitespace. Every token becomes an
    ``<a class="wc-tokens">`` element whose onclick calls ``wc_search`` with the
    token, followed by 3 to 6 spaces chosen at random. After the fifth token a
    nested "word-cloud-section" div is opened to hold the remaining tokens.
    :param df: pd.DataFrame, must contain 'title' and 'description' columns
    :return: str, HTML snippet
    """
    # A missing title/description would turn the concatenation into NaN and
    # break preprocessing, so treat missing values as empty text.
    combined_text = df['title'].fillna('') + ' ' + df['description'].fillna('')
    cleaned_text = TextPreprocessor().preprocess(combined_text)
    # Rows that clean to nothing come back as this placeholder sentence; leave
    # them out so the placeholder's words are not counted as frequent tokens.
    cleaned_text = cleaned_text[cleaned_text != "this is an empty message"]
    token_counts = Counter(' '.join(cleaned_text).split())

    html_parts = ['<div class="word-cloud-container">']
    section_opened = False
    for rank, (token, _count) in enumerate(token_counts.most_common(25), start=1):
        padding = " " * np.random.randint(3, 7, 1)[0]
        html_parts.append(f'<a class="wc-tokens" onclick=wc_search("{token}")>{token}</a>{padding}')
        if rank == 5:
            html_parts.append('<div class="word-cloud-section" id="word-cloud-section-id">')
            section_opened = True
    # Close the nested section only if it was opened (needs at least 5 tokens);
    # otherwise the markup would contain an unmatched closing tag.
    html_parts.append('</div></div>' if section_opened else '</div>')
    return ''.join(html_parts)