linguask / src /spell_checker.py
GitHub Action
refs/heads/ci-cd/hugging-face
8b414b0
__all__ = ['SmartSpellChecker']
import re
from collections import Counter
from functools import lru_cache
from typing import Iterable
from spellchecker import SpellChecker
def get_word_counter(text: str):
    """Count the whitespace-separated tokens of *text*, ignoring punctuation.

    Each of the characters ``. , ! ? ; :`` is replaced by a space before the
    text is split on whitespace, so ``"hi, hi!"`` counts ``hi`` twice.
    Other characters (apostrophes, hyphens, quotes, ...) stay part of the
    token, and no case folding is applied.

    Returns:
        A ``Counter`` mapping each token to its number of occurrences.
    """
    without_punctuation = re.sub(r"[.,!?;:]", " ", text)
    tally = Counter()
    tally.update(without_punctuation.split())
    return tally
# Hand-written corrections. ``SmartSpellChecker.correct_word`` consults this
# table first, before the black list, the exclusions and the dictionary.
custom_mappings = {
    "alot": "a lot",
    "classwork": "class work",
    "everytime": "every time",
    "loosing": "losing",
    "clases": "classes",
    "payed": "paid",
    "learnd": "learned",
    "ect": "etc",
    "wasnt": "wasn't",
    "wich": "which",
    "sol's": "souls",
    "thigs": "things",
    "activies": "activities",
    "oline": "online",
    "thru": "through",
    "inconclusion": "in conclusion",
}

# Plain substring replacements that ``SmartSpellChecker.correct_text`` applies
# to the whole text before any unknown-word detection happens. Note that the
# first key carries its own surrounding spaces.
skipped_mappings = {
    " u ": " you ",
    "youll": "you will",
    "wont": "won't",
}

# Words for which ``correct_word`` returns ``None``, i.e. they are left as
# they are even when the spell checker reports them as unknown.
exclude_words_from_check = {
    "you're",
    "covid",
}

# Words for which ``correct_word`` returns the empty string, i.e. they are
# removed from the text.
black_list = {
    "ther",
    "waldo's",
    "f's",
    "",
}
class SmartSpellChecker:
    """Spell checker that layers project-specific rules over ``SpellChecker``.

    Corrections are resolved in this order: the hand-written
    ``custom_mappings`` table, the ``black_list`` (word is removed), the
    ``exclude_words_from_check`` set (word is kept), a minimum-length guard,
    and finally the dictionary-based ``SpellChecker.correction``.
    """

    def __init__(self):
        self.spellcheck = SpellChecker()
        # Per-instance memo of dictionary look-ups. Decorating the method
        # itself with ``lru_cache`` would store ``self`` in a class-level
        # cache and keep every instance alive for the life of the process.
        self._cached_correction = lru_cache(maxsize=None)(
            self.spellcheck.correction
        )

    def correct_word(self, mismatch: str):
        """Return the replacement for a single unknown word.

        Args:
            mismatch: The word reported as unknown.

        Returns:
            The replacement string (``""`` for black-listed words), or
            ``None`` when the word must be left untouched.
        """
        if mismatch in custom_mappings:
            return custom_mappings[mismatch]
        if mismatch in black_list:
            return ""
        if mismatch in exclude_words_from_check:
            return None
        # Tokens of one or two characters are left alone: the spell checker
        # sometimes reports stray letters such as 'b' or 'c' as misspellings.
        if len(mismatch) <= 2:
            return None
        return self._cached_correction(mismatch)

    def correct_text(self, text: str):
        """Return *text* with known shorthand and misspelled words corrected.

        The ``skipped_mappings`` are applied first as plain substring
        replacements. The remaining unknown words are then replaced one by
        one, matching whole words only, so that correcting ``ect`` does not
        also rewrite ``correct`` or ``effect``.
        """
        for key, value in skipped_mappings.items():
            text = text.replace(key, value)

        # NOTE(review): the words to fix are exactly what
        # ``SpellChecker.unknown`` returns. If it lower-cases its input,
        # capitalised misspellings will not be found in *text* below.
        # Confirm whether callers lower-case the text first.
        unknown_words = self.unknown(get_word_counter(text))
        for misspell in unknown_words:
            correct = self.correct_word(misspell)
            if correct is not None:
                text = self._replace_word(text, misspell, correct)
        return text

    def unknown(self, words: Iterable):
        """Return the words the underlying ``SpellChecker`` reports as unknown."""
        return self.spellcheck.unknown(words)

    @staticmethod
    def _replace_word(text: str, word: str, replacement: str):
        """Replace whole-word occurrences of *word* in *text*.

        An occurrence counts only when it is not directly preceded or
        followed by a word character, so substrings of longer words are
        never touched. An empty *word* leaves *text* unchanged.
        """
        if not word:
            return text
        pattern = re.compile(r"(?<!\w)" + re.escape(word) + r"(?!\w)")
        # A callable replacement keeps backslashes in *replacement* literal.
        return pattern.sub(lambda _match: replacement, text)