ksvmuralidhar committed on
Commit
8975be1
1 Parent(s): a87c8fa

Update word_cloud.py

Browse files
Files changed (1) hide show
  1. word_cloud.py +17 -17
word_cloud.py CHANGED
@@ -503,7 +503,7 @@ class TextPreprocessor:
503
  def __remove_double_whitespaces(string: str):
504
  return " ".join(string.split())
505
 
506
- def __remove_url(self, string_series: pd.Series):
507
  """
508
  Removes URLs from text
509
  :param string_series: pd.Series, input string series
@@ -514,7 +514,7 @@ class TextPreprocessor:
514
  repl=" ", regex=True).copy()
515
  return clean_string_series.map(self.__remove_double_whitespaces)
516
 
517
- def __expand(self, string_series: pd.Series):
518
  """
519
  Replaces contractions with expansions. eg. don't with do not.
520
  :param string_series: pd.Series, input string series
@@ -525,7 +525,7 @@ class TextPreprocessor:
525
  clean_string_series = clean_string_series.str.replace(pat=c, repl=e, regex=False).copy()
526
  return clean_string_series.map(self.__remove_double_whitespaces)
527
 
528
- def __remove_punct(self, string_series: pd.Series):
529
  """
530
  Removes punctuations from the input string.
531
  :param string_series: pd.Series, input string series
@@ -538,7 +538,7 @@ class TextPreprocessor:
538
  clean_string_series = clean_string_series.str.replace(pat=i, repl=" ", regex=False).copy()
539
  return clean_string_series.map(self.__remove_double_whitespaces)
540
 
541
- def __remove_digits(self, string_series: pd.Series):
542
  """
543
  Removes digits from the input string.
544
  :param string_series: pd.Series, input string series
@@ -548,7 +548,7 @@ class TextPreprocessor:
548
  return clean_string_series.map(self.__remove_double_whitespaces)
549
 
550
  @staticmethod
551
- def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1):
552
  """
553
  Removes words/tokens where minlen <= len <= maxlen.
554
  :param string_series: pd.Series, input string series
@@ -560,7 +560,7 @@ class TextPreprocessor:
560
  (len(word) > maxlen) or (len(word) < minlen)]))
561
  return clean_string_series
562
 
563
- def __remove_stop_words(self, string_series: pd.Series):
564
  """
565
  Removes stop words from the input string.
566
  :param string_series: pd.Series, input string series
@@ -572,7 +572,7 @@ class TextPreprocessor:
572
 
573
  return string_series.map(str_remove_stop_words)
574
 
575
- def __remove_top_bottom_words(self, string_series: pd.Series, top_p: int = None,
576
  bottom_p: int = None, dataset: str = 'train'):
577
  """
578
  Removes top_p percent (frequent) words and bottom_p percent (rare) words.
@@ -605,7 +605,7 @@ class TextPreprocessor:
605
  if word not in self.words_to_remove]))
606
  return clean_string_series
607
 
608
- def preprocess(self, string_series: pd.Series, dataset: str = "train"):
609
  """
610
  Entry point.
611
  :param string_series: pd.Series, input string series
@@ -614,20 +614,20 @@ class TextPreprocessor:
614
  """
615
  string_series = string_series.str.lower().copy()
616
  string_series = string_series.map(unidecode).copy()
617
- string_series = self.__remove_url(string_series=string_series)
618
- string_series = self.__expand(string_series=string_series)
619
 
620
  if self.remove_punct:
621
- string_series = self.__remove_punct(string_series=string_series)
622
  if self.remove_digits:
623
- string_series = self.__remove_digits(string_series=string_series)
624
  if self.remove_stop_words:
625
- string_series = self.__remove_stop_words(string_series=string_series)
626
  if self.remove_short_words:
627
- string_series = self.__remove_short_words(string_series=string_series,
628
  minlen=self.minlen,
629
  maxlen=self.maxlen)
630
- string_series = self.__remove_top_bottom_words(string_series=string_series,
631
  top_p=self.top_p,
632
  bottom_p=self.bottom_p, dataset=dataset)
633
 
@@ -637,9 +637,9 @@ class TextPreprocessor:
637
  return string_series
638
 
639
 
640
- def get_frequent_words_html(df):
641
  text_preprocess = TextPreprocessor()
642
- preprocessed_txt = text_preprocess.preprocess(df['title'] + ' ' + df['description'])
643
  counter = Counter(' '.join([*preprocessed_txt]).split())
644
 
645
  freq_tokens_html = '<div class="word-cloud-container">'
 
503
  def __remove_double_whitespaces(string: str):
504
  return " ".join(string.split())
505
 
506
+ async def __remove_url(self, string_series: pd.Series):
507
  """
508
  Removes URLs from text
509
  :param string_series: pd.Series, input string series
 
514
  repl=" ", regex=True).copy()
515
  return clean_string_series.map(self.__remove_double_whitespaces)
516
 
517
+ async def __expand(self, string_series: pd.Series):
518
  """
519
  Replaces contractions with expansions. eg. don't with do not.
520
  :param string_series: pd.Series, input string series
 
525
  clean_string_series = clean_string_series.str.replace(pat=c, repl=e, regex=False).copy()
526
  return clean_string_series.map(self.__remove_double_whitespaces)
527
 
528
+ async def __remove_punct(self, string_series: pd.Series):
529
  """
530
  Removes punctuations from the input string.
531
  :param string_series: pd.Series, input string series
 
538
  clean_string_series = clean_string_series.str.replace(pat=i, repl=" ", regex=False).copy()
539
  return clean_string_series.map(self.__remove_double_whitespaces)
540
 
541
+ async def __remove_digits(self, string_series: pd.Series):
542
  """
543
  Removes digits from the input string.
544
  :param string_series: pd.Series, input string series
 
548
  return clean_string_series.map(self.__remove_double_whitespaces)
549
 
550
  @staticmethod
551
+ async def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1):
552
  """
553
  Removes words/tokens where minlen <= len <= maxlen.
554
  :param string_series: pd.Series, input string series
 
560
  (len(word) > maxlen) or (len(word) < minlen)]))
561
  return clean_string_series
562
 
563
+ async def __remove_stop_words(self, string_series: pd.Series):
564
  """
565
  Removes stop words from the input string.
566
  :param string_series: pd.Series, input string series
 
572
 
573
  return string_series.map(str_remove_stop_words)
574
 
575
+ async def __remove_top_bottom_words(self, string_series: pd.Series, top_p: int = None,
576
  bottom_p: int = None, dataset: str = 'train'):
577
  """
578
  Removes top_p percent (frequent) words and bottom_p percent (rare) words.
 
605
  if word not in self.words_to_remove]))
606
  return clean_string_series
607
 
608
+ async def preprocess(self, string_series: pd.Series, dataset: str = "train"):
609
  """
610
  Entry point.
611
  :param string_series: pd.Series, input string series
 
614
  """
615
  string_series = string_series.str.lower().copy()
616
  string_series = string_series.map(unidecode).copy()
617
+ string_series = await self.__remove_url(string_series=string_series)
618
+ string_series = await self.__expand(string_series=string_series)
619
 
620
  if self.remove_punct:
621
+ string_series = await self.__remove_punct(string_series=string_series)
622
  if self.remove_digits:
623
+ string_series = await self.__remove_digits(string_series=string_series)
624
  if self.remove_stop_words:
625
+ string_series = await self.__remove_stop_words(string_series=string_series)
626
  if self.remove_short_words:
627
+ string_series = await self.__remove_short_words(string_series=string_series,
628
  minlen=self.minlen,
629
  maxlen=self.maxlen)
630
+ string_series = await self.__remove_top_bottom_words(string_series=string_series,
631
  top_p=self.top_p,
632
  bottom_p=self.bottom_p, dataset=dataset)
633
 
 
637
  return string_series
638
 
639
 
640
+ async def get_frequent_words_html(df):
641
  text_preprocess = TextPreprocessor()
642
+ preprocessed_txt = await text_preprocess.preprocess(df['title'] + ' ' + df['description'])
643
  counter = Counter(' '.join([*preprocessed_txt]).split())
644
 
645
  freq_tokens_html = '<div class="word-cloud-container">'