linguask / src /text_preprocessings /spellcheck_preprocessing.py
GitHub Action
refs/heads/ci-cd/hugging-face
8b414b0
raw
history blame contribute delete
982 Bytes
from functools import lru_cache
import pandas as pd
from tqdm import tqdm
from src.feature_extractors.text_statistics_utils import preprocess_test
from src.spell_checker import SmartSpellChecker
from src.text_preprocessings.base_preprocessing import BasePreprocessor
class SpellcheckTextPreprocessor(BasePreprocessor):
    """Text preprocessor that applies ``preprocess_test`` and then spell-checks.

    Every text is first passed through ``preprocess_test`` and the result is
    handed to the injected spell checker's ``correct_text``.

    Results are memoised in a small per-instance LRU cache so that repeated
    texts are only processed once. The cache is deliberately stored on the
    instance rather than created with ``functools.lru_cache`` on the method:
    a method-level ``lru_cache`` is shared by all instances, keys on ``self``
    and therefore keeps every instance (and its spell checker) alive for the
    lifetime of the process.

    The cache is not synchronised; guard calls externally if one instance is
    shared between threads.
    """

    # Same bound as the default ``maxsize`` of ``functools.lru_cache()``.
    _CACHE_MAX_SIZE = 128

    def __init__(self, spellcheck: SmartSpellChecker):
        """Store the spell checker and create an empty result cache.

        Args:
            spellcheck: Object whose ``correct_text`` method is applied to
                every text after ``preprocess_test``.
        """
        super().__init__()
        self._spellcheck = spellcheck
        # Maps raw input text -> processed text. Plain dicts keep insertion
        # order, which is used below to track recency (oldest entry first).
        self._text_cache = {}

    def preprocess_data(self, data: pd.Series) -> pd.Series:
        """Preprocess every text in ``data``.

        Args:
            data: Series of raw texts.

        Returns:
            A new Series holding the processed texts, carrying the same
            index as ``data``.
        """
        progress = tqdm(
            data,
            desc="Preprocessing texts (correcting mistakes, removing tokens, etc.)...",
        )
        processed = [self._preprocess_text(text) for text in progress]
        return pd.Series(processed, index=data.index)

    def _preprocess_text(self, text: str) -> str:
        """Return the processed form of ``text``, reusing cached results.

        Args:
            text: Raw input text; it must be hashable because it is used as
                the cache key.

        Returns:
            The output of ``correct_text`` applied to ``preprocess_test(text)``.
        """
        cache = self._text_cache
        if text in cache:
            # Remove and re-insert below so the entry becomes the newest one.
            result = cache.pop(text)
        else:
            result = self._spellcheck.correct_text(preprocess_test(text))
            if len(cache) >= self._CACHE_MAX_SIZE:
                # Evict the least recently used entry (first key in the dict).
                del cache[next(iter(cache))]
        cache[text] = result
        return result