Upload 9 files
Browse files- data_preparing.py +26 -0
- data_splitting.py +12 -0
- model.py +13 -0
- training.py +10 -0
data_preparing.py
CHANGED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Build the model inputs and targets for language identification.

Module-level names provided:

* ``cv`` / ``features`` -- the fitted ``CountVectorizer`` and the
  bag-of-words matrix it produces from ``df['clean_text']``.
* ``le`` / ``targets`` -- the fitted ``LabelEncoder`` and the integer-encoded
  ``df['language']`` column.
* ``y_train_encoded`` / ``y_val_encoded`` -- one-hot versions of the
  ``y_train`` / ``y_val`` splits defined in ``data_splitting``. These two are
  resolved lazily (see ``__getattr__`` below).
"""
from sklearn.feature_extraction.text import CountVectorizer
from data_analysis import df
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

# Input variable:
# vectorize the 'clean_text' column into a document-term count matrix.
cv = CountVectorizer()  # ngram_range=(1,2)
features = cv.fit_transform(df['clean_text'])

# Store the counts as uint8 to consume less memory.
# NOTE(review): uint8 only holds 0-255; confirm that no token occurs more
# often than that within a single document, otherwise use a wider dtype.
features = features.astype('uint8')  # uint8 and float32

# Target variable:
# LabelEncoder assigns a placeholder integer to every category of the
# 'language' column.
le = LabelEncoder()
df['language_encoded'] = le.fit_transform(df['language'])

targets = df['language_encoded']

# Maps each lazily computed attribute to the split it is derived from.
_LAZY_ONE_HOT = {
    'y_train_encoded': 'y_train',
    'y_val_encoded': 'y_val',
}


def __getattr__(name):
    """Compute ``y_train_encoded`` / ``y_val_encoded`` on first access.

    ``data_splitting`` imports ``features``, ``targets`` and ``le`` from this
    module, so importing ``data_splitting`` at the top of this file creates a
    circular import that fails regardless of which module is loaded first.
    Deferring the import until one of the encoded targets is actually
    requested breaks the cycle while keeping
    ``from data_preparing import y_train_encoded, y_val_encoded`` working.

    Raises:
        AttributeError: if ``name`` is not one of the lazily provided names.
    """
    try:
        split_name = _LAZY_ONE_HOT[name]
    except KeyError:
        raise AttributeError(
            f"module {__name__!r} has no attribute {name!r}"
        ) from None

    import data_splitting  # deferred on purpose: see docstring

    # Size the one-hot matrix from the encoder instead of a hard-coded 22 so
    # it always agrees with data_splitting.num_classes.
    encoded = tf.keras.utils.to_categorical(
        getattr(data_splitting, split_name),
        num_classes=len(le.classes_),
    )
    # Cache the result so later lookups bypass __getattr__.
    globals()[name] = encoded
    return encoded
|
data_splitting.py
CHANGED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Split the prepared features/targets into training and validation sets."""
from sklearn.model_selection import train_test_split
from data_preparing import features, targets, le

# Hold out 20% of the samples for validation; the fixed seed keeps the
# partition reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=2007,
)

# Convert both feature splits with .toarray() (presumably sparse -> dense so
# the Keras model can consume them directly -- confirm against training code).
X_train, X_val = X_train.toarray(), X_val.toarray()

# Dimensions the network is built from: one input per feature column and one
# output per class known to the label encoder.
input_size = X_train.shape[1]
num_classes = len(le.classes_)
|
model.py
CHANGED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Definition of the feed-forward classifier exposed as ``model``."""
import tensorflow as tf
from data_splitting import num_classes, input_size

# Widths of the hidden Dense layers, in order from input to output.
_HIDDEN_UNITS = (100, 80, 50)


def _hidden_dense(units, **extra):
    """Return a ReLU Dense layer with He-normal init and L2(0.01) weights."""
    return tf.keras.layers.Dense(
        units,
        activation='relu',
        kernel_initializer='he_normal',
        kernel_regularizer=tf.keras.regularizers.l2(0.01),
        **extra,
    )


# Each hidden Dense layer is followed by batch normalisation; only the first
# Dense layer declares the input shape.
_stack = []
for _position, _units in enumerate(_HIDDEN_UNITS):
    _extra = {'input_shape': (input_size,)} if _position == 0 else {}
    _stack.append(_hidden_dense(_units, **_extra))
    _stack.append(tf.keras.layers.BatchNormalization())

# Softmax head with one unit per class.
_stack.append(tf.keras.layers.Dense(num_classes, activation='softmax'))

model = tf.keras.models.Sequential(_stack)
|
training.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""Compile the network, fit it on the training split and save it to disk."""
from model import model
from data_splitting import num_classes, X_train, X_val, input_size
from data_preparing import y_train_encoded, y_val_encoded
from model_callbacks import optimizer, early_stopping, lr_scheduler_callback

# The targets are one-hot encoded, hence categorical cross-entropy.
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Settings for the fit call, gathered in one place for readability.
_fit_options = {
    'epochs': 10,
    'batch_size': 32,
    'validation_data': (X_val, y_val_encoded),
    'callbacks': [early_stopping, lr_scheduler_callback],
}
model.fit(X_train, y_train_encoded, **_fit_options)

# NOTE(review): the file name spelling is left untouched because other code
# may load the model by this exact path -- confirm before renaming.
model.save('full_language_identifcation_model1.h5')
|