Update app.py
app.py CHANGED
@@ -1,18 +1,13 @@
 import gradio as gr
 import tensorflow as tf
-import pickle
 import numpy as np
-
-
-
-
-
-
-
-with open('url_tokenizer.pkl', 'rb') as f:
-    url_tokenizer = pickle.load(f)
-with open('html_tokenizer.pkl', 'rb') as f:
-    html_tokenizer = pickle.load(f)
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.preprocessing.text import Tokenizer
+import re
 
 # Load the model
 model = tf.keras.models.load_model('new_phishing_detection_model.keras')
@@ -22,26 +17,66 @@ model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
               loss='binary_crossentropy',
               metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
 
-#
+# Preprocessing functions
+nltk.download('punkt')
+nltk.download('stopwords')
+nltk.download('wordnet')
+
+STOPWORDS = set(stopwords.words('english'))
+lemmatizer = WordNetLemmatizer()
+
+def preprocess_url(url):
+    url = url.lower()
+    url = re.sub(r'https?://', '', url)
+    url = re.sub(r'www\.', '', url)
+    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
+    url = re.sub(r'\s+', ' ', url).strip()
+    tokens = word_tokenize(url)
+    tokens = [word for word in tokens if word not in STOPWORDS]
+    tokens = [lemmatizer.lemmatize(word) for word in tokens]
+    return ' '.join(tokens)
+
+def preprocess_html(html):
+    html = re.sub(r'<[^>]+>', ' ', html)
+    html = html.lower()
+    html = re.sub(r'https?://', '', html)
+    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)
+    html = re.sub(r'\s+', ' ', html).strip()
+    tokens = word_tokenize(html)
+    tokens = [word for word in tokens if word not in STOPWORDS]
+    tokens = [lemmatizer.lemmatize(word) for word in tokens]
+    return ' '.join(tokens)
+
+max_url_length = 180
+max_html_length = 2000
+max_words = 10000
+
+url_tokenizer = Tokenizer(num_words=max_words, char_level=True)
+html_tokenizer = Tokenizer(num_words=max_words)
+
+# Dummy fit to initialize tokenizers
+url_tokenizer.fit_on_texts(["dummy"])
+html_tokenizer.fit_on_texts(["dummy"])
+
 def preprocess_input(input_text, tokenizer, max_length):
     sequences = tokenizer.texts_to_sequences([input_text])
-    padded_sequences =
+    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
     return padded_sequences
 
-# Function to get prediction
 def get_prediction(input_text, input_type):
     is_url = input_type == "URL"
     if is_url:
-
-        input_data =
+        cleaned_text = preprocess_url(input_text)
+        input_data = preprocess_input(cleaned_text, url_tokenizer, max_url_length)
+        input_data = [input_data, np.zeros((1, max_html_length))]  # dummy HTML input
     else:
-
-        input_data =
+        cleaned_text = preprocess_html(input_text)
+        input_data = preprocess_input(cleaned_text, html_tokenizer, max_html_length)
+        input_data = [np.zeros((1, max_url_length)), input_data]  # dummy URL input
 
     prediction = model.predict(input_data)[0][0]
     return prediction
 
-# Gradio UI
 def phishing_detection(input_text, input_type):
     prediction = get_prediction(input_text, input_type)
     if prediction > 0.5:
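
Note on the tokenizer change: the pickled tokenizers that previously carried the training vocabulary are replaced here by fresh Tokenizer objects fitted on the single string "dummy". Keras's texts_to_sequences silently drops tokens outside the fitted vocabulary, so real URLs and HTML pages will encode to near-empty sequences and the model will see almost nothing but padding. A safer pattern, sketched below under the assumption that the training script can export its fitted tokenizers (the .json file names are hypothetical, not part of this commit), is to serialize them as JSON rather than pickle:

# Training side (assumed): save the vocabulary learned on the real corpus.
with open('url_tokenizer.json', 'w') as f:  # hypothetical file name
    f.write(url_tokenizer.to_json())

# Inference side (app.py): restore the exact vocabulary used in training.
from tensorflow.keras.preprocessing.text import tokenizer_from_json

with open('url_tokenizer.json') as f:
    url_tokenizer = tokenizer_from_json(f.read())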
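
For reference, a worked example of the new URL path: the intermediate string follows directly from the regex pipeline in preprocess_url, and the shape from pad_sequences with maxlen=max_url_length.

cleaned = preprocess_url("https://www.example.com/login?user=1")
# -> "example com login user 1"
x = preprocess_input(cleaned, url_tokenizer, max_url_length)
# x.shape == (1, 180): one row, padded/truncated to max_url_length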
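
The two-element list handed to model.predict implies a two-input Keras model, with np.zeros standing in for whichever branch the user did not supply; this assumes the model tolerates an all-padding branch. A minimal sanity check (a sketch using only the standard model.inputs attribute) that the padded arrays match the model's expected input lengths:

for arr, inp in zip(input_data, model.inputs):
    assert arr.shape[1] == inp.shape[1], (arr.shape, inp.shape)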