from collections import defaultdict
from typing import Dict, List

import nltk
import numpy as np
import pandas as pd

from src.feature_extractors.base_extractor import BaseExtractor


class TermFrequencyFeatureExtractor(BaseExtractor):
    """Build a dataframe with a distribution of term frequencies.

    Usage:
        >>> import catboost
        >>> data = pd.read_csv("data/raw/train.csv").set_index("text_id")
        >>> featurizer = TermFrequencyFeatureExtractor()
        >>> X = featurizer.generate_features(data.full_text)
        >>> y = data["vocabulary"]
        >>> model = catboost.CatBoostRegressor()
        >>> model.fit(X, y)

    Possible improvements:
        - Add word corrections: triying -> trying
        - Count not only word frequencies but also the number of unique words
          in each histogram bin
    """

    # Largest term frequency expected in the unigram table (presumably the
    # count of the most common word); its log1p fixes the top bin edge.
    MAX_TERM_FREQUENCY = 23135751162

    def __init__(self, n_bins: int = 40):
        self.term2freq: Dict[str, int] = self._load_term2freq_dict()
        self.bins = self._make_bins(n_bins)
        self.feature_names = [
            f"bin_{round(self.bins[i], 1)}_{round(self.bins[i + 1], 1)}"
            for i in range(len(self.bins) - 1)
        ]
        # Tokenizer models required by nltk.tokenize.word_tokenize.
        nltk.download("punkt")

    def _make_bins(self, n_bins: int) -> np.ndarray:
        min_bin = 0
        max_bin = np.log1p(self.MAX_TERM_FREQUENCY)
        # n_bins histogram bins require n_bins + 1 edges.
        bins = np.linspace(min_bin, max_bin, n_bins + 1)
        return bins
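
    # Example, assuming n_bins=4: edges are evenly spaced in log1p space,
    # roughly [0.0, 5.97, 11.93, 17.9, 23.87], giving four equal-width bins.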

    def _load_term2freq_dict(self) -> Dict[str, int]:
        # Expects a CSV with `word` and `count` columns, e.g. the Kaggle
        # "English Word Frequency" unigram table.
        term_frequencies = pd.read_csv("data/word_frequencies/unigram_freq.csv")
        # Words missing from the table fall back to a frequency of 0.
        term2freq: Dict[str, int] = defaultdict(int)
        term2freq.update(term_frequencies.set_index("word").to_dict()["count"])
        return term2freq

    def generate_features(self, data: pd.Series) -> pd.DataFrame:
        """Extract features from each text as a histogram of word frequencies.

        A logarithm is applied to the frequencies for the sake of distribution
        normality.
        """
        feature_df = data.apply(self._compute_word_frequency_histogram)
        feature_df.columns = self.feature_names
        return feature_df
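
    # For intuition: with MAX_TERM_FREQUENCY ~= 2.3e10, log1p maps raw counts
    # from [0, 2.3e10] into roughly [0, 23.9], so evenly spaced bins resolve
    # rare and common words at comparable granularity.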

    def _compute_word_frequency_histogram(self, text: str) -> pd.Series:
        term_frequencies: List[int] = self._compute_term_frequencies_from_text(text)
        histogram_values: np.ndarray = self._build_histogram(term_frequencies)
        return pd.Series(histogram_values)

    def _compute_term_frequencies_from_text(self, text: str) -> List[int]:
        tokens = nltk.tokenize.word_tokenize(text)
        # Keep alphabetic tokens only, lowercased to match the unigram table.
        words = [token.lower() for token in tokens if token.isalpha()]
        word_frequencies = [self.term2freq[word] for word in words]
        return word_frequencies

    def _build_histogram(self, values: List[int]) -> np.ndarray:
        values_log = np.log1p(values)
        histogram, _ = np.histogram(values_log, bins=self.bins)
        # Guard against texts with no alphabetic tokens, which would otherwise
        # divide by zero; such texts yield an all-zero histogram.
        normalized_histogram = histogram / max(len(values), 1)
        return normalized_histogram
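

# A minimal smoke test, assuming the unigram CSV expected by
# `_load_term2freq_dict` is available; the sample texts are made up
# for illustration.
if __name__ == "__main__":
    featurizer = TermFrequencyFeatureExtractor(n_bins=10)
    sample = pd.Series(
        [
            "The quick brown fox jumps over the lazy dog.",
            "Perspicacious readers notice recondite words.",
        ]
    )
    features = featurizer.generate_features(sample)
    print(features.round(3))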