Spaces:

Booguy
/

linguask

Build error

linguask / src /text_preprocessings /spellcheck_preprocessing.py

GitHub Action

refs/heads/ci-cd/hugging-face

8b414b0 almost 2 years ago

982 Bytes

	from functools import lru_cache

	import pandas as pd
	from tqdm import tqdm

	from src.feature_extractors.text_statistics_utils import preprocess_test
	from src.spell_checker import SmartSpellChecker
	from src.text_preprocessings.base_preprocessing import BasePreprocessor


	class SpellcheckTextPreprocessor(BasePreprocessor):
	    """Preprocessor that cleans each text and then runs it through a spell checker.

	    Every text is passed to ``preprocess_test`` and the result is handed to
	    ``correct_text`` of the injected spell checker. Results are memoized per
	    instance, so a text seen again by the same preprocessor is not recomputed.
	    """

	    def __init__(self, spellcheck: SmartSpellChecker):
	        """Store the spell checker and set up the per-instance result cache.

	        Args:
	            spellcheck: Object whose ``correct_text(text)`` method is applied
	                to every text after ``preprocess_test``.
	        """
	        super().__init__()
	        self._spellcheck = spellcheck
	        # The cache is created per instance on purpose: decorating the method
	        # itself with lru_cache would put `self` into a class-level cache key,
	        # keeping every instance (and its spell checker) alive for the life of
	        # the process. 128 is functools.lru_cache's default bound.
	        self._cached_preprocess = lru_cache(maxsize=128)(
	            self._preprocess_text_uncached
	        )

	    def preprocess_data(self, data: pd.Series) -> pd.Series:
	        """Preprocess every text in ``data`` and return them under the same index.

	        Args:
	            data: Series of texts; iterated in order with a progress bar.

	        Returns:
	            A new Series holding the processed texts, indexed by ``data.index``.
	        """
	        progress = tqdm(
	            data,
	            desc="Preprocessing texts (correcting mistakes, removing tokens, etc.)...",
	        )
	        processed = [self._preprocess_text(text) for text in progress]
	        return pd.Series(processed, index=data.index)

	    def _preprocess_text(self, text: str) -> str:
	        """Return the processed form of ``text``, reusing a cached result if any.

	        Args:
	            text: Text to process; it is used as the cache key, so it must be
	                hashable.

	        Returns:
	            The text after ``preprocess_test`` and spell correction.
	        """
	        return self._cached_preprocess(text)

	    def _preprocess_text_uncached(self, text: str) -> str:
	        """Run ``preprocess_test`` and then spell correction on a single text."""
	        text = preprocess_test(text)
	        return self._spellcheck.correct_text(text)