Spaces:
Build error
Build error
ShAnSantosh
committed on
Commit
•
b289e03
1
Parent(s):
099c56e
Upload nltk_utils.py
Browse files- nltk_utils.py +43 -0
nltk_utils.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import nltk
|
3 |
+
nltk.download('punkt')
|
4 |
+
from nltk.stem.porter import PorterStemmer
|
5 |
+
stemmer = PorterStemmer()
|
6 |
+
|
7 |
+
def tokenize(sentence):
    """Split a sentence into a list of tokens.

    The splitting is delegated to ``nltk.word_tokenize``; each token may be
    a word, a punctuation character, or a number.
    """
    tokens = nltk.word_tokenize(sentence)
    return tokens
|
13 |
+
|
14 |
+
|
15 |
+
def stem(word):
    """Return the stem (root form) of ``word``.

    The word is lower-cased first and then passed to the module-level
    ``stemmer``.

    Example:
        words = ["organize", "organizes", "organizing"]
        words = [stem(w) for w in words]
        -> ["organ", "organ", "organ"]
    """
    lowered = word.lower()
    return stemmer.stem(lowered)
|
24 |
+
|
25 |
+
|
26 |
+
def bag_of_words(tokenized_sentence, words):
    """Return the bag-of-words vector of a tokenized sentence.

    Each token of ``tokenized_sentence`` is passed through ``stem`` (which
    lower-cases it), and the result holds 1 at every position whose entry
    in ``words`` equals one of those stems, 0 elsewhere.

    Args:
        tokenized_sentence: iterable of token strings.
        words: the known vocabulary. Entries are compared verbatim against
            the lower-cased stems, so they should already be stemmed and
            lower-cased to be able to match.

    Returns:
        A 1-D ``numpy`` array of dtype ``float32`` with one element per
        entry of ``words``.

    Example:
        sentence = ["hello", "how", "are", "you"]
        words = ["hi", "hello", "I", "you", "bye", "thank", "cool"]
        bag   = [  0 ,    1 ,    0 ,   1 ,    0 ,    0 ,      0]
    """
    # Stem every token once and keep the stems in a set, so each vocabulary
    # lookup below is a hash lookup instead of a scan over the sentence.
    sentence_stems = {stem(token) for token in tokenized_sentence}
    # The booleans convert to 1.0 / 0.0 under the float32 dtype.
    return np.array(
        [known_word in sentence_stems for known_word in words],
        dtype=np.float32,
    )
|