hassaanik committed on
Commit
c5c7499
1 Parent(s): f01ad32

Upload 9 files

Browse files
Files changed (4) hide show
  1. data_preparing.py +26 -0
  2. data_splitting.py +12 -0
  3. model.py +13 -0
  4. training.py +10 -0
data_preparing.py CHANGED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build the input features and encoded targets for language identification.

Importing this module fits a CountVectorizer on ``df['clean_text']`` and a
LabelEncoder on ``df['language']`` and exposes ``features``, ``cv``, ``le``
and ``targets``.

``y_train_encoded`` and ``y_val_encoded`` (and the raw ``y_train`` / ``y_val``)
are resolved lazily on first access.  They come from ``data_splitting``, which
itself imports ``features``, ``targets`` and ``le`` from this module, so
importing ``data_splitting`` at the top of this file would be a circular
import that fails with ImportError.
"""
from sklearn.feature_extraction.text import CountVectorizer
from data_analysis import df
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf

# Input variable:
# vectorize the input column 'clean_text' into a document-term count matrix.
features = df['clean_text']

cv = CountVectorizer()  # ngram_range=(1,2)
features = cv.fit_transform(features)

# Store the counts as uint8 so the matrix consumes less memory.
# NOTE(review): a token occurring more than 255 times in one document would
# wrap around in uint8 -- confirm the texts are short enough for this to hold.
features = features.astype('uint8')  # uint8 and float32


# Target variable:
# use LabelEncoder to get integer placeholder values for the categorical
# variable 'language'.
le = LabelEncoder()
df['language_encoded'] = le.fit_transform(df['language'])

targets = df['language_encoded']

# Lazily resolved names -> (attribute to read from data_splitting, one-hot?).
_LAZY_SPLIT_NAMES = {
    'y_train': ('y_train', False),
    'y_val': ('y_val', False),
    'y_train_encoded': ('y_train', True),
    'y_val_encoded': ('y_val', True),
}


def __getattr__(name):
    """Resolve the split-dependent names on first access (PEP 562).

    ``from data_preparing import y_train_encoded, y_val_encoded`` keeps
    working: the import machinery falls back to this hook for names that are
    not yet in the module namespace.  The result is cached in ``globals()``
    so the work is done once per name.

    Raises:
        AttributeError: if *name* is not one of the lazily provided names.
    """
    if name not in _LAZY_SPLIT_NAMES:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # Deferred on purpose: by the time a caller asks for these names this
    # module is fully initialised, so data_splitting can import from it.
    import data_splitting

    source_name, one_hot = _LAZY_SPLIT_NAMES[name]
    value = getattr(data_splitting, source_name)
    if one_hot:
        # Take the width from the fitted encoder instead of a hard-coded 22 so
        # it always matches num_classes used for the model's output layer.
        value = tf.keras.utils.to_categorical(value, num_classes=len(le.classes_))

    globals()[name] = value
    return value
data_splitting.py CHANGED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Split the prepared data into training and validation sets.

Exposes ``X_train``, ``X_val``, ``y_train``, ``y_val`` plus the two sizes the
model definition needs: ``input_size`` and ``num_classes``.

NOTE: this module imports from ``data_preparing`` at import time, so
``data_preparing`` must not import this module at its own top level
(that would be a circular import).
"""
from sklearn.model_selection import train_test_split
from data_preparing import features, le, targets

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
_split = train_test_split(features, targets, random_state=2007, test_size=0.2)

# The first two parts are the feature matrices: convert each with .toarray().
# The last two are the matching targets and are used as returned.
X_train, X_val = (part.toarray() for part in _split[:2])
y_train, y_val = _split[2:]
del _split  # do not keep extra references to the pre-conversion matrices

# Width of one feature row, and the number of distinct labels seen by the encoder.
input_size = X_train.shape[1]
num_classes = len(le.classes_)
model.py CHANGED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Define the feed-forward classifier used for language identification.

Exposes ``model``: three ReLU Dense layers (100, 80 and 50 units), each
followed by BatchNormalization, and a softmax output with ``num_classes``
units.  The model is built here but not compiled.
"""
import tensorflow as tf
from data_splitting import num_classes, input_size

# Hidden layer widths, in order from input to output.
_HIDDEN_UNITS = (100, 80, 50)
# L2 penalty applied to the kernel of every hidden Dense layer.
_L2_STRENGTH = 0.01


def _build_model():
    """Assemble the Sequential network layer by layer and return it."""
    layers = []
    for position, units in enumerate(_HIDDEN_UNITS):
        # Only the very first layer declares the input shape.
        extra = {'input_shape': (input_size,)} if position == 0 else {}
        layers.append(
            tf.keras.layers.Dense(
                units,
                activation='relu',
                kernel_initializer='he_normal',
                kernel_regularizer=tf.keras.regularizers.l2(_L2_STRENGTH),
                **extra,
            )
        )
        layers.append(tf.keras.layers.BatchNormalization())

    # Output layer: one softmax unit per class.
    layers.append(tf.keras.layers.Dense(num_classes, activation='softmax'))
    return tf.keras.models.Sequential(layers)


model = _build_model()
training.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
"""Compile, train and save the language identification model.

Running (or importing) this module trains ``model`` on the training split,
validates on the validation split, and writes the result to an HDF5 file in
the current working directory.
"""
from model import model
from data_splitting import num_classes, X_train, X_val, input_size
from data_preparing import y_train_encoded, y_val_encoded
from model_callbacks import optimizer, early_stopping, lr_scheduler_callback

# Targets are one-hot encoded, hence categorical cross-entropy.
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Callbacks come from model_callbacks and are passed in this order.
_callbacks = [early_stopping, lr_scheduler_callback]
_validation = (X_val, y_val_encoded)

model.fit(
    x=X_train,
    y=y_train_encoded,
    batch_size=32,
    epochs=10,
    validation_data=_validation,
    callbacks=_callbacks,
)

# NOTE(review): the file name is spelled 'identifcation'; it is kept as-is
# because anything loading the saved model depends on this exact name.
model.save('full_language_identifcation_model1.h5')