Spaces:

Booguy
/

linguask

Build error

linguask / src /spell_checker.py

GitHub Action

refs/heads/ci-cd/hugging-face

8b414b0 almost 2 years ago

2.1 kB

	__all__ = ['SmartSpellChecker']

	import re
	from collections import Counter
	from functools import lru_cache
	from typing import Iterable

	from spellchecker import SpellChecker


	def get_word_counter(text: str):
	    """Count how often each whitespace-separated token occurs in *text*.

	    The characters ``. , ! ? ; :`` are turned into spaces before splitting,
	    so they act as token separators and never appear in a key.  Any other
	    character (apostrophes, quotes, hyphens, ...) stays part of its token.
	    An empty or whitespace-only *text* yields an empty ``Counter``.
	    """
	    tally = Counter()
	    tally.update(re.sub(r"[.,!?;:]", " ", text).split())
	    return tally


	# Misspellings with a fixed, hand-picked replacement.  ``correct_word``
	# consults this table first, before any other rule or the spell checker.
	custom_mappings = {
	    "alot": "a lot",
	    "classwork": "class work",
	    "everytime": "every time",
	    "loosing": "losing",
	    "clases": "classes",
	    "payed": "paid",
	    "learnd": "learned",
	    "ect": "etc",
	    "wasnt": "wasn't",
	    "wich": "which",
	    "sol's": "souls",
	    "thigs": "things",
	    "activies": "activities",
	    "oline": "online",
	    "thru": "through",
	    "inconclusion": "in conclusion",
	}

	# Raw substring replacements that ``correct_text`` applies to the whole text,
	# in this order, before any word is counted or spell-checked.  Note that the
	# first key carries its surrounding spaces.
	skipped_mappings = {
	    " u ": " you ",
	    "youll": "you will",
	    "wont": "won't",
	}

	# Words that ``correct_word`` leaves untouched (it returns ``None`` for them).
	exclude_words_from_check = {
	    "you're",
	    "covid",
	}

	# Words that ``correct_word`` maps to the empty string, i.e. that are deleted.
	black_list = {
	    "ther",
	    "waldo's",
	    "f's",
	    "",
	}


	class SmartSpellChecker:
	    """Spell checker that layers project-specific rules over ``SpellChecker``.

	    For a single word the rules are tried in this order: ``custom_mappings``,
	    ``black_list``, ``exclude_words_from_check``, a minimum-length guard, and
	    only then the wrapped ``SpellChecker`` instance.
	    """

	    def __init__(self):
	        self.spellcheck = SpellChecker()
	        # Per-instance memo of word -> correction.  A class-level ``lru_cache``
	        # on the method would key on ``self`` and keep every instance alive
	        # for as long as the cache exists.
	        self._corrections = {}

	    def correct_word(self, mismatch: str):
	        """Return the replacement for *mismatch*, memoised per instance.

	        Returns:
	            The replacement text; ``""`` when the word is black-listed (the
	            word is to be deleted); or ``None`` when the word must be left
	            as it is.
	        """
	        # ``setdefault`` keeps this working for instances whose ``__init__``
	        # did not run (e.g. a subclass that sets ``spellcheck`` itself).
	        memo = self.__dict__.setdefault("_corrections", {})
	        try:
	            return memo[mismatch]
	        except KeyError:
	            pass

	        correction = self._lookup_correction(mismatch)
	        memo[mismatch] = correction
	        return correction

	    def _lookup_correction(self, mismatch):
	        """Apply the rule chain to *mismatch* without consulting the memo."""
	        if mismatch in custom_mappings:
	            return custom_mappings[mismatch]

	        if mismatch in black_list:
	            return ""

	        if mismatch in exclude_words_from_check:
	            return None

	        # The spell checker sometimes reports very short tokens such as 'b'
	        # or 'c' as misspelled, so anything of length <= 2 is left alone.
	        if len(mismatch) <= 2:
	            return None

	        return self.spellcheck.correction(mismatch)

	    @staticmethod
	    def _replace_token(text, token, replacement):
	        """Replace every whole-token occurrence of *token* in *text*.

	        A token boundary is the start/end of the text, whitespace, or one of
	        ``. , ! ? ; :`` -- the same separators ``get_word_counter`` uses -- so
	        an occurrence inside a longer token (``ther`` in ``another``) is not
	        touched.  *replacement* is inserted literally.
	        """
	        if not token:
	            return text

	        pattern = r"(?<![^\s.,!?;:])" + re.escape(token) + r"(?![^\s.,!?;:])"
	        # A callable replacement avoids backslash/group expansion.
	        return re.sub(pattern, lambda _match: replacement, text)

	    def correct_text(self, text: str):
	        """Return *text* with known and detected misspellings corrected.

	        First every ``skipped_mappings`` key is replaced as a raw substring.
	        Then the text is tokenised with ``get_word_counter``, the tokens
	        reported by ``unknown`` are looked up with ``correct_word``, and each
	        one that has a non-``None`` correction is replaced as a whole token.
	        """
	        for key, value in skipped_mappings.items():
	            text = text.replace(key, value)

	        word_count = get_word_counter(text)

	        # NOTE(review): the wrapped checker may normalise case in ``unknown``;
	        # if so, capitalised misspellings will not match the text -- confirm.
	        for misspell in self.unknown(word_count):
	            correct = self.correct_word(misspell)
	            if correct is not None:
	                text = self._replace_token(text, misspell, correct)

	        return text

	    def unknown(self, words: Iterable):
	        """Return the subset of *words* the wrapped spell checker flags."""
	        return self.spellcheck.unknown(words)