ARIN_7102 / nltk_u.py
henry2024's picture
nice
40ee52c
raw
history blame
835 Bytes
# Import Libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK data packages this module relies on.
nltk.download('punkt')
# Recent NLTK releases load the Punkt sentence models from the 'punkt_tab'
# package instead of the pickled 'punkt' one, and word_tokenize raises
# LookupError when it is missing. Fetch both so either NLTK version works.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Single English Snowball stemmer instance shared by the whole module.
stemmer = SnowballStemmer(language='english')
# Tokenize text, i.e. turn it into a list of stemmed tokens,
# e.g. "I am sick" -> ['i', 'am', 'sick']
def tokenize(text):
    """Split *text* with ``word_tokenize`` and return the stem of each token.

    Args:
        text: The string to tokenize.

    Returns:
        A list with one stemmed token per token produced by ``word_tokenize``,
        in the same order.
    """
    return list(map(stemmer.stem, word_tokenize(text)))
# English stop-word list from the NLTK corpus; used to reduce noise in the data
english_stopwords = stopwords.words('english')
# Create a vectorizer to learn all words in order to convert them into numbers
def vectorizer():
    """Build a new, unfitted TF-IDF vectorizer that stems tokens and drops stop words.

    scikit-learn filters stop words *after* the custom tokenizer has run, so
    the stop list must contain tokens in the same form that ``tokenize``
    emits. Passing the raw NLTK list lets every stop word whose stem differs
    from its surface form (e.g. "because" -> "becaus") slip through as a
    feature, and makes scikit-learn warn that the stop words are inconsistent
    with the preprocessing. The stop list is therefore extended with the
    tokenized/stemmed form of each stop word.

    Returns:
        TfidfVectorizer: An unfitted vectorizer configured with ``tokenize``
        as its tokenizer and the extended English stop-word list.
    """
    # Keep the original words and add what `tokenize` turns each of them into.
    stop_tokens = set(english_stopwords)
    for word in english_stopwords:
        stop_tokens.update(tokenize(word))

    return TfidfVectorizer(
        tokenizer=tokenize,
        # token_pattern is ignored whenever a tokenizer is supplied; passing
        # None states that explicitly and avoids the "will not be used" warning.
        token_pattern=None,
        # sorted() yields a list in a deterministic order.
        stop_words=sorted(stop_tokens),
    )