"""Build bag-of-words features and encoded language labels for training.

Module-level results:
    cv              -- CountVectorizer fitted on ``df['clean_text']``
    features        -- document-term count matrix, cast to ``uint8``
    le              -- LabelEncoder fitted on ``df['language']``
    targets         -- the ``df['language_encoded']`` column
    y_train_encoded -- one-hot matrix built from ``y_train``
    y_val_encoded   -- one-hot matrix built from ``y_val``

Side effect: adds a ``language_encoded`` column to the shared ``df``.
"""

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

from data_analysis import df
from data_splitting import y_train, y_val

# Width of the one-hot label vectors. Previously a literal repeated in both
# to_categorical calls.
# NOTE(review): presumably the number of distinct languages in df -- confirm
# it matches len(le.classes_).
NUM_CLASSES = 22

# Learn the vocabulary from the cleaned text and count token occurrences
# per document.
cv = CountVectorizer()
features = cv.fit_transform(df['clean_text'])

# Shrink the count dtype to one byte per stored value.
# NOTE(review): a token occurring more than 255 times in a single document
# would not fit in uint8 -- confirm documents are short enough.
features = features.astype('uint8')

# Map each language name to an integer id and keep it on the frame.
le = LabelEncoder()
df['language_encoded'] = le.fit_transform(df['language'])

targets = df['language_encoded']

# One-hot encode the train/validation labels.
# NOTE(review): assumes y_train / y_val already hold integer class ids in
# [0, NUM_CLASSES) -- verify against data_splitting.
y_train_encoded = tf.keras.utils.to_categorical(y_train, num_classes=NUM_CLASSES)
y_val_encoded = tf.keras.utils.to_categorical(y_val, num_classes=NUM_CLASSES)