import ipaddress
import re
import urllib.request
from bs4 import BeautifulSoup
import socket
import requests
from googlesearch import search
import whois
from datetime import date, datetime
import time
from dateutil.parser import parse as date_parse
from urllib.parse import urlparse
class FeatureExtraction:
features = []
def __init__(self,url):
self.features = []
self.url = url
self.domain = ""
self.whois_response = ""
self.urlparse = ""
self.response = ""
self.soup = ""
try:
self.response = requests.get(url)
self.soup = BeautifulSoup(response.text, 'html.parser')
except:
pass
try:
self.urlparse = urlparse(url)
self.domain = self.urlparse.netloc
except:
pass
try:
self.whois_response = whois.whois(self.domain)
except:
pass
self.features.append(self.UsingIp())
self.features.append(self.longUrl())
self.features.append(self.shortUrl())
self.features.append(self.symbol())
self.features.append(self.redirecting())
self.features.append(self.prefixSuffix())
self.features.append(self.SubDomains())
self.features.append(self.Hppts())
self.features.append(self.DomainRegLen())
self.features.append(self.Favicon())
self.features.append(self.NonStdPort())
self.features.append(self.HTTPSDomainURL())
self.features.append(self.RequestURL())
self.features.append(self.AnchorURL())
self.features.append(self.LinksInScriptTags())
self.features.append(self.ServerFormHandler())
self.features.append(self.InfoEmail())
self.features.append(self.AbnormalURL())
self.features.append(self.WebsiteForwarding())
self.features.append(self.StatusBarCust())
self.features.append(self.DisableRightClick())
self.features.append(self.UsingPopupWindow())
self.features.append(self.IframeRedirection())
self.features.append(self.AgeofDomain())
self.features.append(self.DNSRecording())
self.features.append(self.WebsiteTraffic())
self.features.append(self.PageRank())
self.features.append(self.GoogleIndex())
self.features.append(self.LinksPointingToPage())
self.features.append(self.StatsReport())
# 1.UsingIp
def UsingIp(self):
try:
ipaddress.ip_address(self.url)
return -1
except:
return 1
# 2.longUrl
def longUrl(self):
if len(self.url) < 54:
return 1
if len(self.url) >= 54 and len(self.url) <= 75:
return 0
return -1
# 3.shortUrl
def shortUrl(self):
match = re.search('bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|tr\.im|link\.zip\.net', self.url)
if match:
return -1
return 1
# 4.Symbol@
def symbol(self):
if re.findall("@",self.url):
return -1
return 1
# 5.Redirecting//
def redirecting(self):
if self.url.rfind('//')>6:
return -1
return 1
# 6.prefixSuffix
def prefixSuffix(self):
try:
match = re.findall('\-', self.domain)
if match:
return -1
return 1
except:
return -1
# 7.SubDomains
def SubDomains(self):
dot_count = len(re.findall("\.", self.url))
if dot_count == 1:
return 1
elif dot_count == 2:
return 0
return -1
# 8.HTTPS
def Hppts(self):
try:
https = self.urlparse.scheme
if 'https' in https:
return 1
return -1
except:
return 1
# 9.DomainRegLen
def DomainRegLen(self):
try:
expiration_date = self.whois_response.expiration_date
creation_date = self.whois_response.creation_date
try:
if(len(expiration_date)):
expiration_date = expiration_date[0]
except:
pass
try:
if(len(creation_date)):
creation_date = creation_date[0]
except:
pass
age = (expiration_date.year-creation_date.year)*12+ (expiration_date.month-creation_date.month)
if age >=12:
return 1
return -1
except:
return -1
# 10. Favicon
def Favicon(self):
try:
for head in self.soup.find_all('head'):
for head.link in self.soup.find_all('link', href=True):
dots = [x.start(0) for x in re.finditer('\.', head.link['href'])]
if self.url in head.link['href'] or len(dots) == 1 or domain in head.link['href']:
return 1
return -1
except:
return -1
# 11. NonStdPort
def NonStdPort(self):
try:
port = self.domain.split(":")
if len(port)>1:
return -1
return 1
except:
return -1
# 12. HTTPSDomainURL
def HTTPSDomainURL(self):
try:
if 'https' in self.domain:
return -1
return 1
except:
return -1
# 13. RequestURL
def RequestURL(self):
try:
for img in self.soup.find_all('img', src=True):
dots = [x.start(0) for x in re.finditer('\.', img['src'])]
if self.url in img['src'] or self.domain in img['src'] or len(dots) == 1:
success = success + 1
i = i+1
for audio in self.soup.find_all('audio', src=True):
dots = [x.start(0) for x in re.finditer('\.', audio['src'])]
if self.url in audio['src'] or self.domain in audio['src'] or len(dots) == 1:
success = success + 1
i = i+1
for embed in self.soup.find_all('embed', src=True):
dots = [x.start(0) for x in re.finditer('\.', embed['src'])]
if self.url in embed['src'] or self.domain in embed['src'] or len(dots) == 1:
success = success + 1
i = i+1
for iframe in self.soup.find_all('iframe', src=True):
dots = [x.start(0) for x in re.finditer('\.', iframe['src'])]
if self.url in iframe['src'] or self.domain in iframe['src'] or len(dots) == 1:
success = success + 1
i = i+1
try:
percentage = success/float(i) * 100
if percentage < 22.0:
return 1
elif((percentage >= 22.0) and (percentage < 61.0)):
return 0
else:
return -1
except:
return 0
except:
return -1
# 14. AnchorURL
def AnchorURL(self):
try:
i,unsafe = 0,0
for a in self.soup.find_all('a', href=True):
if "#" in a['href'] or "javascript" in a['href'].lower() or "mailto" in a['href'].lower() or not (url in a['href'] or self.domain in a['href']):
unsafe = unsafe + 1
i = i + 1
try:
percentage = unsafe / float(i) * 100
if percentage < 31.0:
return 1
elif ((percentage >= 31.0) and (percentage < 67.0)):
return 0
else:
return -1
except:
return -1
except:
return -1
# 15. LinksInScriptTags
def LinksInScriptTags(self):
try:
i,success = 0,0
for link in self.soup.find_all('link', href=True):
dots = [x.start(0) for x in re.finditer('\.', link['href'])]
if self.url in link['href'] or self.domain in link['href'] or len(dots) == 1:
success = success + 1
i = i+1
for script in self.soup.find_all('script', src=True):
dots = [x.start(0) for x in re.finditer('\.', script['src'])]
if self.url in script['src'] or self.domain in script['src'] or len(dots) == 1:
success = success + 1
i = i+1
try:
percentage = success / float(i) * 100
if percentage < 17.0:
return 1
elif((percentage >= 17.0) and (percentage < 81.0)):
return 0
else:
return -1
except:
return 0
except:
return -1
# 16. ServerFormHandler
def ServerFormHandler(self):
try:
if len(self.soup.find_all('form', action=True))==0:
return 1
else :
for form in self.soup.find_all('form', action=True):
if form['action'] == "" or form['action'] == "about:blank":
return -1
elif self.url not in form['action'] and self.domain not in form['action']:
return 0
else:
return 1
except:
return -1
# 17. InfoEmail
def InfoEmail(self):
try:
if re.findall(r"[mail\(\)|mailto:?]", self.soap):
return -1
else:
return 1
except:
return -1
# 18. AbnormalURL
def AbnormalURL(self):
try:
if self.response.text == self.whois_response:
return 1
else:
return -1
except:
return -1
# 19. WebsiteForwarding
def WebsiteForwarding(self):
try:
if len(self.response.history) <= 1:
return 1
elif len(self.response.history) <= 4:
return 0
else:
return -1
except:
return -1
# 20. StatusBarCust
def StatusBarCust(self):
try:
if re.findall("", self.response.text):
return 1
else:
return -1
except:
return -1
# 21. DisableRightClick
def DisableRightClick(self):
try:
if re.findall(r"event.button ?== ?2", self.response.text):
return 1
else:
return -1
except:
return -1
# 22. UsingPopupWindow
def UsingPopupWindow(self):
try:
if re.findall(r"alert\(", self.response.text):
return 1
else:
return -1
except:
return -1
# 23. IframeRedirection
def IframeRedirection(self):
try:
if re.findall(r"[