import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot

# The corpora used for stop-word removal and lemmatization must be downloaded once.
nltk.download("stopwords")
nltk.download("wordnet")

# Load the training data, drop rows with missing values, and re-index so the
# positional loop below lines up with the DataFrame index.
train_dir = "data/train.csv"
df = pd.read_csv(train_dir)
df = df.dropna().reset_index(drop=True)

X = df.drop(labels=["label", "id"], axis=1)
y = df["label"]
xdata = X.copy()

# Clean each title: keep letters only, lowercase, drop stop words, lemmatize.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))
xtitle = []
for i in range(len(xdata)):
    sent = re.sub("[^a-zA-Z]", " ", xdata["title"][i])
    sent = sent.lower().split()
    sent = [lemmatizer.lemmatize(word) for word in sent if word not in stop_words]
    xtitle.append(" ".join(sent))

# Hyperparameters.
vocab_size = 5000
embedding_feature_len = 30
max_sent_len = 20
batch_size = 32
epochs = 10

# Hash each word to an integer index, then pad/truncate every title to a fixed length.
one_hot_representation = [one_hot(words, vocab_size) for words in xtitle]
padded_sequences = pad_sequences(one_hot_representation, truncating="post",
                                 padding="post", maxlen=max_sent_len)

X = np.array(padded_sequences)
y = np.array(y)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Embedding -> dropout across whole embedding feature maps -> LSTM -> sigmoid
# output for binary classification.
model = Sequential()
model.add(Embedding(vocab_size, embedding_feature_len, input_length=max_sent_len))
model.add(SpatialDropout1D(rate=0.2))
model.add(LSTM(units=128))
model.add(Dense(units=1))
model.add(Activation("sigmoid"))

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

hist = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
                 validation_data=(x_test, y_test))

# predict_classes() was removed in TensorFlow 2.6; threshold the sigmoid
# probabilities at 0.5 instead, and flatten to a 1-D label array.
y_pred = (model.predict(x_test) > 0.5).astype("int32").ravel()
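
# A minimal evaluation sketch (assumes scikit-learn's metrics module, from the
# same library already imported above): score the thresholded predictions
# against the held-out labels.
from sklearn.metrics import accuracy_score, confusion_matrix

print("Test accuracy:", accuracy_score(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))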
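
# Optional: visualize the training history captured in `hist` with the
# matplotlib import above. Keras records the "accuracy"/"val_accuracy" keys
# because the model was compiled with metrics=["accuracy"].
plt.plot(hist.history["accuracy"], label="train accuracy")
plt.plot(hist.history["val_accuracy"], label="validation accuracy")
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.legend()
plt.show()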