import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the tokenizer model and stop-word lists used below.
# On recent NLTK releases, word_tokenize may also require: nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')

stemmer = SnowballStemmer(language='english')


def tokenize(text):
    # Split the text into word tokens and reduce each token to its Snowball stem.
    return [stemmer.stem(token) for token in word_tokenize(text)]
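
# Quick sanity check of the stemming tokenizer (the sample sentence is illustrative):
#   tokenize("The cats are running")  ->  ['the', 'cat', 'are', 'run']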

# NLTK's built-in English stop-word list.
english_stopwords = stopwords.words('english')


def vectorizer():
    # Build a TF-IDF vectorizer that tokenizes and stems with `tokenize`.
    # The stop words are stemmed as well; otherwise the stemmed tokens never
    # match the raw stop-word strings, and scikit-learn warns that the
    # stop-word list is inconsistent with the preprocessing.
    return TfidfVectorizer(
        tokenizer=tokenize,
        stop_words=[stemmer.stem(word) for word in english_stopwords],
    )
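
# Minimal usage sketch, assuming scikit-learn >= 1.0 for get_feature_names_out();
# the documents below are made-up examples.
if __name__ == '__main__':
    docs = [
        "The cats are running in the garden.",
        "A cat ran across the gardens.",
    ]
    tfidf = vectorizer()
    matrix = tfidf.fit_transform(docs)
    print(tfidf.get_feature_names_out())  # stemmed vocabulary terms
    print(matrix.toarray())               # TF-IDF weights, one row per document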