Update app.py
app.py CHANGED
@@ -12,11 +12,67 @@ import nltk
 # # English words from NLTK corpus
 # english_words = set(nltk.corpus.words.words())
 
-with open("index.dic") as f:
-    hunspell_words = {line.split("/")[0].strip() for line in f if not line.startswith("#")}
+# with open("index.dic") as f:
+#     hunspell_words = {line.split("/")[0].strip() for line in f if not line.startswith("#")}
+
+# def is_english_word(word):
+#     return word.lower() in hunspell_words
+
+from nltk.stem import WordNetLemmatizer, PorterStemmer
+from nltk.corpus import words, wordnet
+import spacy
+from spellchecker import SpellChecker
+import string
+
+# Download necessary NLTK resources
+nltk.download('wordnet')
+nltk.download('words')
+
+# Initialize tools
+lemmatizer = WordNetLemmatizer()
+stemmer = PorterStemmer()
+english_words = set(words.words())
+nlp = spacy.load("en_core_web_sm")  # SpaCy language model
+spell = SpellChecker()  # Spell checker
+
+# Combine dictionaries for better coverage
+combined_dictionary = english_words.union(spell.word_frequency.keys())
 
 def is_english_word(word):
-    return word.lower() in hunspell_words
+    """
+    Checks if a word is English and returns the valid English word or None if not recognized.
+    """
+    # Preprocess the word: strip punctuation and lowercase
+    word_cleaned = word.lower().strip(string.punctuation)
+    if not word_cleaned:
+        return None
+
+    # 1. Direct dictionary match
+    if word_cleaned in combined_dictionary:
+        return word_cleaned
+
+    # 2. Lemmatization
+    lemma = lemmatizer.lemmatize(word_cleaned)
+    if lemma in combined_dictionary:
+        return lemma
+
+    # 3. Stemming
+    stem = stemmer.stem(word_cleaned)
+    if stem in combined_dictionary:
+        return stem
+
+    # 4. Spell checker
+    corrected_word = spell.correction(word_cleaned)
+    if corrected_word in combined_dictionary:
+        return corrected_word
+
+    # 5. SpaCy's language model (check if token is recognized as English)
+    doc = nlp(word_cleaned)
+    if doc and doc[0].is_alpha and doc[0].lang_ == "en":
+        return word_cleaned
+
+    return None
+
 
 
 # Define Devanagari digits and patterns for matching
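For reference, a minimal usage sketch of the new fallback chain (not part of the commit). It assumes the setup above has already run: the NLTK data downloaded, the spaCy model en_core_web_sm installed, and pyspellchecker available. The sample words, and the stage that catches each one, are illustrative assumptions; which stage fires depends on the dictionaries loaded at runtime.

# Hypothetical quick check of is_english_word from app.py (illustrative only).
for sample in ["Running,", "cats", "helloo", "qzxwv"]:
    print(f"{sample!r} -> {is_english_word(sample)!r}")

# Possible output (which stage matches depends on the loaded dictionaries):
# 'Running,' -> 'running'   punctuation stripped, then a dictionary or lemma match
# 'cats'     -> 'cats'      direct match, or 'cat' via lemmatization
# 'helloo'   -> 'hello'     recovered by the spell checker
# 'qzxwv'    -> 'qzxwv'     accepted by stage 5 (see note below)

One design point worth flagging: spaCy's Token.lang_ reports the language of the pipeline that produced the token, so with an English model the stage-5 test doc[0].lang_ == "en" holds for every alphabetic input. As written, the function therefore returns the cleaned word for any purely alphabetic string, and None only when the input is empty or non-alphabetic after all earlier stages miss.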