File size: 1,287 Bytes
8b414b0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 |
from functools import reduce
from typing import Dict, List
import pandas as pd
from tqdm import tqdm
from src.feature_extractors.base_extractor import BaseExtractor
from src.feature_extractors.text_statistics_utils import (
count_how_many_words_are_repeating, count_misspelled_words,
count_punctuation, count_words, preprocess_test)
from src.spell_checker import SmartSpellChecker
class HandcraftedTextFeatureExtractor(BaseExtractor):
    """Feature extractor that combines several hand-crafted text statistics.

    Each input text is passed through the helper counters imported from
    ``text_statistics_utils``; the dictionaries they return are merged into a
    single feature row. The feature names are whatever keys those helpers
    produce.
    """

    def __init__(self, spellcheck: SmartSpellChecker):
        """Store the spell checker used for correction and misspelling counts.

        Args:
            spellcheck: Object providing ``correct_text`` and accepted by
                ``count_misspelled_words``.
        """
        self._spellcheck = spellcheck

    def _generate_features(self, raw_text: str) -> Dict[str, int]:
        """Compute the merged feature dictionary for a single text.

        Args:
            raw_text: The unprocessed input text.

        Returns:
            One dictionary holding the entries of all helper results. If two
            helpers emit the same key, the value from the later helper wins.
        """
        normalized = preprocess_test(raw_text)
        corrected = self._spellcheck.correct_text(normalized)

        # Punctuation is counted on the untouched text, misspellings on the
        # preprocessed text, and the word statistics on the corrected text.
        partial_results = (
            count_punctuation(raw_text),
            count_misspelled_words(normalized, self._spellcheck),
            count_words(corrected),
            count_how_many_words_are_repeating(corrected),
        )

        merged: Dict[str, int] = {}
        for partial in partial_results:
            merged.update(partial)
        return merged

    def generate_features(self, data: pd.Series) -> pd.DataFrame:
        """Compute features for every text in ``data``.

        Args:
            data: Series of texts to process.

        Returns:
            A DataFrame with one row per element of ``data``, built from the
            per-text feature dictionaries and carrying ``data``'s index.
        """
        rows = []
        for text in tqdm(data, desc="Gen. handcraft text features..."):
            rows.append(self._generate_features(text))
        return pd.DataFrame(rows, index=data.index)
|