File size: 887 Bytes
c5c7499
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
from sklearn.feature_extraction.text import CountVectorizer
from data_analysis import df
from sklearn.preprocessing import LabelEncoder
from data_splitting import y_train, y_val
import tensorflow as tf

# Input variable
# Build the model input by turning the 'clean_text' column of df into a
# sparse token-count matrix.
cv = CountVectorizer()  # unigrams only; ngram_range=(1, 2) was left disabled

# Fit the vocabulary and cast the counts to uint8 in one step so the matrix
# consumes less memory (original note on dtype choices: uint8 and float32).
# NOTE(review): uint8 wraps silently above 255 -- confirm no single document
# repeats one token more often than that.
features = cv.fit_transform(df['clean_text']).astype('uint8')


# Target variable
# LabelEncoder assigns an integer placeholder to every distinct value of the
# categorical 'language' column; the fitted encoder is kept in `le` so the
# mapping stays available afterwards.
le = LabelEncoder()
_language_codes = le.fit_transform(df['language'])

# Store the integer labels on df and expose that column as the targets.
df['language_encoded'] = _language_codes
targets = df['language_encoded']

# Width of the one-hot vectors produced below.
# NOTE(review): presumably the number of distinct languages in the dataset --
# confirm it matches len(le.classes_).
NUM_CLASSES = 22

# One-hot encode the training and validation labels (train first, then val).
y_train_encoded, y_val_encoded = (
    tf.keras.utils.to_categorical(labels, num_classes=NUM_CLASSES)
    for labels in (y_train, y_val)
)