import gradio as gr
import tensorflow as tf
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import re
# Load the model
model = tf.keras.models.load_model('new_phishing_detection_model.keras')
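# The saved model is expected to take two inputs (a URL sequence and an HTML
# sequence), since get_prediction() below always feeds it [url_input, html_input].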
# Compile the model with standard loss and metrics
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
              loss='binary_crossentropy',
              metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
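# Note: compiling is only needed if the model will be evaluated or fine-tuned;
# model.predict() works on the loaded model without recompiling.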
# Preprocessing functions
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_url(url):
    url = url.lower()
    url = re.sub(r'https?://', '', url)   # strip scheme
    url = re.sub(r'www\.', '', url)       # strip leading www.
    url = re.sub(r'[^a-zA-Z0-9]', ' ', url)
    url = re.sub(r'\s+', ' ', url).strip()
    tokens = word_tokenize(url)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
def preprocess_html(html):
    html = re.sub(r'<[^>]+>', ' ', html)   # drop HTML tags, keep text content
    html = html.lower()
    html = re.sub(r'https?://', '', html)
    html = re.sub(r'[^a-zA-Z0-9]', ' ', html)
    html = re.sub(r'\s+', ' ', html).strip()
    tokens = word_tokenize(html)
    tokens = [word for word in tokens if word not in STOPWORDS]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)
max_url_length = 180
max_html_length = 2000
max_words = 10000
url_tokenizer = Tokenizer(num_words=max_words, char_level=True)
html_tokenizer = Tokenizer(num_words=max_words)
# Placeholder fit so the tokenizers have a vocabulary; in practice the tokenizers
# fitted on the training data should be restored (see the sketch below).
url_tokenizer.fit_on_texts(["dummy"])
html_tokenizer.fit_on_texts(["dummy"])
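# Hedged sketch: restoring the training-time tokenizers instead of the placeholder
# fit above. The file names 'url_tokenizer.pkl' and 'html_tokenizer.pkl' are
# assumptions for illustration, not files shipped with this Space.
# import pickle
# with open('url_tokenizer.pkl', 'rb') as f:
#     url_tokenizer = pickle.load(f)
# with open('html_tokenizer.pkl', 'rb') as f:
#     html_tokenizer = pickle.load(f)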
def preprocess_input(input_text, tokenizer, max_length):
    sequences = tokenizer.texts_to_sequences([input_text])
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    return padded_sequences
def get_prediction(input_text, input_type):
    is_url = input_type == "URL"
    if is_url:
        cleaned_text = preprocess_url(input_text)
        input_data = preprocess_input(cleaned_text, url_tokenizer, max_url_length)
        input_data = [input_data, np.zeros((1, max_html_length))]  # dummy HTML input
    else:
        cleaned_text = preprocess_html(input_text)
        input_data = preprocess_input(cleaned_text, html_tokenizer, max_html_length)
        input_data = [np.zeros((1, max_url_length)), input_data]  # dummy URL input
    prediction = model.predict(input_data)[0][0]
    return prediction
def phishing_detection(input_text, input_type):
    prediction = get_prediction(input_text, input_type)
    if prediction > 0.5:
        return f"Warning: This site is likely a phishing site! ({prediction:.2f})"
    else:
        return f"Safe: This site is not likely a phishing site. ({prediction:.2f})"
iface = gr.Interface(
    fn=phishing_detection,
    inputs=[
        gr.components.Textbox(lines=5, placeholder="Enter URL or HTML code"),
        gr.components.Radio(["URL", "HTML"], type="value", label="Input Type")
    ],
    outputs=gr.components.Textbox(label="Phishing Detection Result"),
    title="Phishing Detection Model",
    description="Check whether a URL or an HTML snippet looks like a phishing page.",
    theme="default"
)

iface.launch()
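# Example (illustrative, assumed input): the prediction function can also be
# called directly without the Gradio UI.
# print(phishing_detection("http://secure-login.example.com/verify", "URL"))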