|
from functools import reduce |
|
from typing import Dict, List |
|
|
|
import pandas as pd |
|
from tqdm import tqdm |
|
|
|
from src.feature_extractors.base_extractor import BaseExtractor |
|
from src.feature_extractors.text_statistics_utils import ( |
|
count_how_many_words_are_repeating, count_misspelled_words, |
|
count_punctuation, count_words, preprocess_test) |
|
from src.spell_checker import SmartSpellChecker |
|
|
|
|
|
class HandcraftedTextFeatureExtractor(BaseExtractor): |
|
def __init__(self, spellcheck: SmartSpellChecker): |
|
self._spellcheck = spellcheck |
|
|
|
def _generate_features(self, raw_text: str) -> Dict[str, int]: |
|
preprocessed_text = preprocess_test(raw_text) |
|
cleaned_text = self._spellcheck.correct_text(preprocessed_text) |
|
|
|
features: List[Dict] = [ |
|
count_punctuation(raw_text), |
|
count_misspelled_words(preprocessed_text, self._spellcheck), |
|
count_words(cleaned_text), |
|
count_how_many_words_are_repeating(cleaned_text) |
|
] |
|
|
|
return reduce(lambda x, y: {**x, **y}, features) |
|
|
|
def generate_features(self, data: pd.Series) -> pd.DataFrame: |
|
features = [self._generate_features(text) for text in tqdm(data, desc="Gen. handcraft text features...")] |
|
return pd.DataFrame(features, index=data.index) |
|
|