# linguask/src/feature_extractors/term_frequency_feature_extractor.py
from collections import defaultdict
from typing import Dict, List
import nltk
import numpy as np
import pandas as pd
from src.feature_extractors.base_extractor import BaseExtractor
class TermFrequencyFeatureExtractor(BaseExtractor):
"""Build a dataframe with a distribution of term frequencies
Usage:
>>> data = pd.read_csv("data/raw/train.csv").set_index("text_id")
>>> featurizer = TermFrequencyFeaturizer()
>>> X = featurizer.featurize(data.full_text)
>>> y = data["vocabulary"]
>>> model = catboost.CatBoostRegressor()
>>> model.fit(x_train, y_train)
Possible improvements:
- Add word corrections: triying -> trying
- Count not only word frequencies, but number of unique words in each hist bin
"""
    MAX_TERM_FREQUENCY = 23135751162  # the largest count in the term frequency dataset
def __init__(self, n_bins: int = 40):
self.term2freq: Dict[str, int] = self._load_term2freq_dict()
self.bins = self._make_bins(n_bins)
self.feature_names = [
f"bin_{round(self.bins[i], 1)}_{round(self.bins[i+1], 1)}"
for i in range(len(self.bins) - 1)
]
nltk.download("punkt")
def _make_bins(self, n_bins: int) -> np.ndarray:
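        """Create equally spaced bin edges on a log1p scale.

        Note that np.linspace returns n_bins edge values, which define
        n_bins - 1 histogram bins.
        """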
min_bin = 0
max_bin = np.log1p(self.MAX_TERM_FREQUENCY)
bins = np.linspace(min_bin, max_bin, n_bins)
return bins
def _load_term2freq_dict(self) -> Dict[str, int]:
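        """Load the word -> count mapping from the unigram frequency CSV.

        Words absent from the dataset get a default frequency of 0.
        """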
term_frequencies = pd.read_csv("data/word_frequencies/unigram_freq.csv")
        term2freq: Dict[str, int] = defaultdict(int)
        term2freq.update(term_frequencies.set_index("word")["count"].to_dict())
return term2freq
def generate_features(self, data: pd.Series) -> pd.DataFrame:
"""Extracts features from the text in the form of histogram of word frequencies
Logarithm operation is applied to the frequencies for the sake of distribution
normality.
"""
feature_df = data.apply(self._compute_word_frequency_histogram)
feature_df.columns = self.feature_names
return feature_df
def _compute_word_frequency_histogram(self, text: str) -> pd.Series:
term_frequencies: List[int] = self._compute_term_frequencies_from_text(text)
histogram_values: np.ndarray = self._build_histogram(term_frequencies)
return pd.Series(histogram_values)
def _compute_term_frequencies_from_text(self, text: str) -> List[int]:
tokens = nltk.tokenize.word_tokenize(text)
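        # Keep alphabetic tokens only and lowercase them so they match the
        # keys of the frequency dictionary.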
words = [token.lower() for token in tokens if token.isalpha()]
word_frequencies = [self.term2freq[word] for word in words]
return word_frequencies
    def _build_histogram(self, values: List[int]) -> np.ndarray:
        # log1p maps the zero frequency of unknown words to 0 instead of -inf
        values_log = np.log1p(values)
        histogram, __ = np.histogram(values_log, bins=self.bins)
        # Guard against division by zero for texts with no alphabetic tokens
        normalized_histogram = histogram / max(len(values), 1)
        return normalized_histogram
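

if __name__ == "__main__":
    # Minimal smoke-test sketch: featurize a couple of illustrative sentences.
    # Assumptions: the unigram frequency CSV exists at the path hard-coded in
    # _load_term2freq_dict, and BaseExtractor imposes no further abstract
    # methods; the sample texts below are hypothetical.
    sample_texts = pd.Series(
        [
            "The quick brown fox jumps over the lazy dog.",
            "Lexical sophistication correlates with vocabulary scores.",
        ]
    )
    extractor = TermFrequencyFeatureExtractor(n_bins=10)
    features = extractor.generate_features(sample_texts)
    print(features.head())  # one row per text, one column per frequency bin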