Update word_cloud.py
word_cloud.py  +17 -17  CHANGED
@@ -503,7 +503,7 @@ class TextPreprocessor:
     def __remove_double_whitespaces(string: str):
         return " ".join(string.split())
 
-    def __remove_url(self, string_series: pd.Series):
+    async def __remove_url(self, string_series: pd.Series):
         """
         Removes URLs from text
         :param string_series: pd.Series, input string series
@@ -514,7 +514,7 @@ class TextPreprocessor:
                                                       repl=" ", regex=True).copy()
         return clean_string_series.map(self.__remove_double_whitespaces)
 
-    def __expand(self, string_series: pd.Series):
+    async def __expand(self, string_series: pd.Series):
         """
         Replaces contractions with expansions, e.g. don't with do not.
         :param string_series: pd.Series, input string series
@@ -525,7 +525,7 @@ class TextPreprocessor:
             clean_string_series = clean_string_series.str.replace(pat=c, repl=e, regex=False).copy()
         return clean_string_series.map(self.__remove_double_whitespaces)
 
-    def __remove_punct(self, string_series: pd.Series):
+    async def __remove_punct(self, string_series: pd.Series):
         """
         Removes punctuations from the input string.
         :param string_series: pd.Series, input string series
@@ -538,7 +538,7 @@ class TextPreprocessor:
             clean_string_series = clean_string_series.str.replace(pat=i, repl=" ", regex=False).copy()
         return clean_string_series.map(self.__remove_double_whitespaces)
 
-    def __remove_digits(self, string_series: pd.Series):
+    async def __remove_digits(self, string_series: pd.Series):
         """
         Removes digits from the input string.
         :param string_series: pd.Series, input string series
@@ -548,7 +548,7 @@ class TextPreprocessor:
         return clean_string_series.map(self.__remove_double_whitespaces)
 
     @staticmethod
-    def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1):
+    async def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1):
         """
         Removes words/tokens where minlen <= len <= maxlen.
         :param string_series: pd.Series, input string series
@@ -560,7 +560,7 @@ class TextPreprocessor:
                                                       (len(word) > maxlen) or (len(word) < minlen)]))
         return clean_string_series
 
-    def __remove_stop_words(self, string_series: pd.Series):
+    async def __remove_stop_words(self, string_series: pd.Series):
         """
         Removes stop words from the input string.
         :param string_series: pd.Series, input string series
@@ -572,7 +572,7 @@ class TextPreprocessor:
 
         return string_series.map(str_remove_stop_words)
 
-    def __remove_top_bottom_words(self, string_series: pd.Series, top_p: int = None,
+    async def __remove_top_bottom_words(self, string_series: pd.Series, top_p: int = None,
                                   bottom_p: int = None, dataset: str = 'train'):
         """
         Removes top_p percent (frequent) words and bottom_p percent (rare) words.
@@ -605,7 +605,7 @@ class TextPreprocessor:
                                               if word not in self.words_to_remove]))
         return clean_string_series
 
-    def preprocess(self, string_series: pd.Series, dataset: str = "train"):
+    async def preprocess(self, string_series: pd.Series, dataset: str = "train"):
         """
         Entry point.
         :param string_series: pd.Series, input string series
@@ -614,20 +614,20 @@ class TextPreprocessor:
         """
         string_series = string_series.str.lower().copy()
         string_series = string_series.map(unidecode).copy()
-        string_series = self.__remove_url(string_series=string_series)
-        string_series = self.__expand(string_series=string_series)
+        string_series = await self.__remove_url(string_series=string_series)
+        string_series = await self.__expand(string_series=string_series)
 
         if self.remove_punct:
-            string_series = self.__remove_punct(string_series=string_series)
+            string_series = await self.__remove_punct(string_series=string_series)
         if self.remove_digits:
-            string_series = self.__remove_digits(string_series=string_series)
+            string_series = await self.__remove_digits(string_series=string_series)
         if self.remove_stop_words:
-            string_series = self.__remove_stop_words(string_series=string_series)
+            string_series = await self.__remove_stop_words(string_series=string_series)
         if self.remove_short_words:
-            string_series = self.__remove_short_words(string_series=string_series,
+            string_series = await self.__remove_short_words(string_series=string_series,
                                                       minlen=self.minlen,
                                                       maxlen=self.maxlen)
-        string_series = self.__remove_top_bottom_words(string_series=string_series,
+        string_series = await self.__remove_top_bottom_words(string_series=string_series,
                                                         top_p=self.top_p,
                                                         bottom_p=self.bottom_p, dataset=dataset)
 
@@ -637,9 +637,9 @@ class TextPreprocessor:
         return string_series
 
 
-def get_frequent_words_html(df):
+async def get_frequent_words_html(df):
     text_preprocess = TextPreprocessor()
-    preprocessed_txt = text_preprocess.preprocess(df['title'] + ' ' + df['description'])
+    preprocessed_txt = await text_preprocess.preprocess(df['title'] + ' ' + df['description'])
     counter = Counter(' '.join([*preprocessed_txt]).split())
 
     freq_tokens_html = '<div class="word-cloud-container">'
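With this change the preprocessing entry point and get_frequent_words_html become coroutines, so they can no longer be called directly; callers must await them inside an event loop. Below is a minimal usage sketch, assuming word_cloud.py is importable as the module word_cloud and using a made-up two-row DataFrame purely for illustration:

import asyncio

import pandas as pd

# Assumption: the Space's word_cloud.py is importable as a module.
from word_cloud import get_frequent_words_html

# Hypothetical sample data; the real app supplies its own DataFrame
# with 'title' and 'description' columns.
df = pd.DataFrame({
    "title": ["Async refactor", "Word cloud demo"],
    "description": ["Methods now return coroutines.", "They must be awaited."],
})

async def main():
    # get_frequent_words_html is now a coroutine, so it is awaited
    # inside an event loop instead of being called directly.
    # The helper is expected to return the word-cloud HTML string.
    html = await get_frequent_words_html(df)
    print(html)

asyncio.run(main())

Note that the pandas string operations inside the helpers remain synchronous; the diff only changes the calling convention, so any existing caller of preprocess or get_frequent_words_html has to be updated to await them as above.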