Update app.py
app.py CHANGED
@@ -12,11 +12,67 @@ import nltk
 # # English words from NLTK corpus
 # english_words = set(nltk.corpus.words.words())
 
-with open("index.dic") as f:
-    hunspell_words = {line.split("/")[0].strip() for line in f if not line.startswith("#")}
+# with open("index.dic") as f:
+#     hunspell_words = {line.split("/")[0].strip() for line in f if not line.startswith("#")}
+
+# def is_english_word(word):
+#     return word.lower() in hunspell_words
+
+from nltk.stem import WordNetLemmatizer, PorterStemmer
+from nltk.corpus import words, wordnet
+import spacy
+from spellchecker import SpellChecker
+import string
+
+# Download necessary NLTK resources
+nltk.download('wordnet')
+nltk.download('words')
+
+# Initialize tools
+lemmatizer = WordNetLemmatizer()
+stemmer = PorterStemmer()
+english_words = set(words.words())
+nlp = spacy.load("en_core_web_sm")  # SpaCy language model
+spell = SpellChecker()  # Spell checker
+
+# Combine dictionaries for better coverage
+combined_dictionary = english_words.union(spell.word_frequency.keys())
 
 def is_english_word(word):
-    return word.lower() in hunspell_words
+    """
+    Checks if a word is English and returns the valid English word or None if not recognized.
+    """
+    # Preprocess the word: strip punctuation and lowercase
+    word_cleaned = word.lower().strip(string.punctuation)
+    if not word_cleaned:
+        return None
+
+    # 1. Direct dictionary match
+    if word_cleaned in combined_dictionary:
+        return word_cleaned
+
+    # 2. Lemmatization
+    lemma = lemmatizer.lemmatize(word_cleaned)
+    if lemma in combined_dictionary:
+        return lemma
+
+    # 3. Stemming
+    stem = stemmer.stem(word_cleaned)
+    if stem in combined_dictionary:
+        return stem
+
+    # 4. Spell checker
+    corrected_word = spell.correction(word_cleaned)
+    if corrected_word in combined_dictionary:
+        return corrected_word
+
+    # 5. SpaCy's language model (check if token is recognized as English)
+    doc = nlp(word_cleaned)
+    if doc and doc[0].is_alpha and doc[0].lang_ == "en":
+        return word_cleaned
+
+    return None
+
 
 
 # Define Devanagari digits and patterns for matching
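For reference, a minimal usage sketch of the new fallback chain (not part of the commit). It assumes the setup above has already run: the NLTK data downloaded, the spaCy model en_core_web_sm installed, and pyspellchecker available. The sample words, and the stage that catches each one, are illustrative assumptions; which stage fires depends on the dictionaries loaded at runtime.

# Hypothetical quick check of is_english_word from app.py (illustrative only).
for sample in ["Running,", "cats", "helloo", "qzxwv"]:
    print(f"{sample!r} -> {is_english_word(sample)!r}")

# Possible output (which stage matches depends on the loaded dictionaries):
# 'Running,' -> 'running'   punctuation stripped, then a dictionary or lemma match
# 'cats'     -> 'cats'      direct match, or 'cat' via lemmatization
# 'helloo'   -> 'hello'     recovered by the spell checker
# 'qzxwv'    -> 'qzxwv'     accepted by stage 5 (see note below)

One design point worth flagging: spaCy's Token.lang_ reports the language of the pipeline that produced the token, so with an English model the stage-5 test doc[0].lang_ == "en" holds for every alphabetic input. As written, the function therefore returns the cleaned word for any purely alphabetic string, and None only when the input is empty or non-alphabetic after all earlier stages miss.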