Spaces:

mstkyvz
/

WebPhishingDetection

Sleeping

App Files Files Community

WebPhishingDetection / feature.py

mstkyvz

Upload 4 files

56ca4c1 verified 4 months ago

raw

history blame contribute delete

16.2 kB

	import ipaddress
	import re
	import urllib.request
	from bs4 import BeautifulSoup
	import socket
	import requests
	from googlesearch import search
	import whois
	from datetime import date, datetime
	import time
	from dateutil.parser import parse as date_parse
	from urllib.parse import urlparse

	class FeatureExtraction:
	features = []
	def __init__(self,url):
	self.features = []
	self.url = url
	self.domain = ""
	self.whois_response = ""
	self.urlparse = ""
	self.response = ""
	self.soup = ""

	try:
	self.response = requests.get(url)
	self.soup = BeautifulSoup(response.text, 'html.parser')
	except:
	pass

	try:
	self.urlparse = urlparse(url)
	self.domain = self.urlparse.netloc
	except:
	pass

	try:
	self.whois_response = whois.whois(self.domain)
	except:
	pass




	self.features.append(self.UsingIp())
	self.features.append(self.longUrl())
	self.features.append(self.shortUrl())
	self.features.append(self.symbol())
	self.features.append(self.redirecting())
	self.features.append(self.prefixSuffix())
	self.features.append(self.SubDomains())
	self.features.append(self.Hppts())
	self.features.append(self.DomainRegLen())
	self.features.append(self.Favicon())


	self.features.append(self.NonStdPort())
	self.features.append(self.HTTPSDomainURL())
	self.features.append(self.RequestURL())
	self.features.append(self.AnchorURL())
	self.features.append(self.LinksInScriptTags())
	self.features.append(self.ServerFormHandler())
	self.features.append(self.InfoEmail())
	self.features.append(self.AbnormalURL())
	self.features.append(self.WebsiteForwarding())
	self.features.append(self.StatusBarCust())

	self.features.append(self.DisableRightClick())
	self.features.append(self.UsingPopupWindow())
	self.features.append(self.IframeRedirection())
	self.features.append(self.AgeofDomain())
	self.features.append(self.DNSRecording())
	self.features.append(self.WebsiteTraffic())
	self.features.append(self.PageRank())
	self.features.append(self.GoogleIndex())
	self.features.append(self.LinksPointingToPage())
	self.features.append(self.StatsReport())


	# 1.UsingIp
	def UsingIp(self):
	try:
	ipaddress.ip_address(self.url)
	return -1
	except:
	return 1

	# 2.longUrl
	def longUrl(self):
	if len(self.url) < 54:
	return 1
	if len(self.url) >= 54 and len(self.url) <= 75:
	return 0
	return -1

	# 3.shortUrl
	def shortUrl(self):
	match = re.search('bit\.ly\|goo\.gl\|shorte\.st\|go2l\.ink\|x\.co\|ow\.ly\|t\.co\|tinyurl\|tr\.im\|is\.gd\|cli\.gs\|'
	'yfrog\.com\|migre\.me\|ff\.im\|tiny\.cc\|url4\.eu\|twit\.ac\|su\.pr\|twurl\.nl\|snipurl\.com\|'
	'short\.to\|BudURL\.com\|ping\.fm\|post\.ly\|Just\.as\|bkite\.com\|snipr\.com\|fic\.kr\|loopt\.us\|'
	'doiop\.com\|short\.ie\|kl\.am\|wp\.me\|rubyurl\.com\|om\.ly\|to\.ly\|bit\.do\|t\.co\|lnkd\.in\|'
	'db\.tt\|qr\.ae\|adf\.ly\|goo\.gl\|bitly\.com\|cur\.lv\|tinyurl\.com\|ow\.ly\|bit\.ly\|ity\.im\|'
	'q\.gs\|is\.gd\|po\.st\|bc\.vc\|twitthis\.com\|u\.to\|j\.mp\|buzurl\.com\|cutt\.us\|u\.bb\|yourls\.org\|'
	'x\.co\|prettylinkpro\.com\|scrnch\.me\|filoops\.info\|vzturl\.com\|qr\.net\|1url\.com\|tweez\.me\|v\.gd\|tr\.im\|link\.zip\.net', self.url)
	if match:
	return -1
	return 1

	# 4.Symbol@
	def symbol(self):
	if re.findall("@",self.url):
	return -1
	return 1

	# 5.Redirecting//
	def redirecting(self):
	if self.url.rfind('//')>6:
	return -1
	return 1

	# 6.prefixSuffix
	def prefixSuffix(self):
	try:
	match = re.findall('\-', self.domain)
	if match:
	return -1
	return 1
	except:
	return -1

	# 7.SubDomains
	def SubDomains(self):
	dot_count = len(re.findall("\.", self.url))
	if dot_count == 1:
	return 1
	elif dot_count == 2:
	return 0
	return -1

	# 8.HTTPS
	def Hppts(self):
	try:
	https = self.urlparse.scheme
	if 'https' in https:
	return 1
	return -1
	except:
	return 1

	# 9.DomainRegLen
	def DomainRegLen(self):
	try:
	expiration_date = self.whois_response.expiration_date
	creation_date = self.whois_response.creation_date
	try:
	if(len(expiration_date)):
	expiration_date = expiration_date[0]
	except:
	pass
	try:
	if(len(creation_date)):
	creation_date = creation_date[0]
	except:
	pass

	age = (expiration_date.year-creation_date.year)*12+ (expiration_date.month-creation_date.month)
	if age >=12:
	return 1
	return -1
	except:
	return -1

	# 10. Favicon
	def Favicon(self):
	try:
	for head in self.soup.find_all('head'):
	for head.link in self.soup.find_all('link', href=True):
	dots = [x.start(0) for x in re.finditer('\.', head.link['href'])]
	if self.url in head.link['href'] or len(dots) == 1 or domain in head.link['href']:
	return 1
	return -1
	except:
	return -1

	# 11. NonStdPort
	def NonStdPort(self):
	try:
	port = self.domain.split(":")
	if len(port)>1:
	return -1
	return 1
	except:
	return -1

	# 12. HTTPSDomainURL
	def HTTPSDomainURL(self):
	try:
	if 'https' in self.domain:
	return -1
	return 1
	except:
	return -1

	# 13. RequestURL
	def RequestURL(self):
	try:
	for img in self.soup.find_all('img', src=True):
	dots = [x.start(0) for x in re.finditer('\.', img['src'])]
	if self.url in img['src'] or self.domain in img['src'] or len(dots) == 1:
	success = success + 1
	i = i+1

	for audio in self.soup.find_all('audio', src=True):
	dots = [x.start(0) for x in re.finditer('\.', audio['src'])]
	if self.url in audio['src'] or self.domain in audio['src'] or len(dots) == 1:
	success = success + 1
	i = i+1

	for embed in self.soup.find_all('embed', src=True):
	dots = [x.start(0) for x in re.finditer('\.', embed['src'])]
	if self.url in embed['src'] or self.domain in embed['src'] or len(dots) == 1:
	success = success + 1
	i = i+1

	for iframe in self.soup.find_all('iframe', src=True):
	dots = [x.start(0) for x in re.finditer('\.', iframe['src'])]
	if self.url in iframe['src'] or self.domain in iframe['src'] or len(dots) == 1:
	success = success + 1
	i = i+1

	try:
	percentage = success/float(i) * 100
	if percentage < 22.0:
	return 1
	elif((percentage >= 22.0) and (percentage < 61.0)):
	return 0
	else:
	return -1
	except:
	return 0
	except:
	return -1

	# 14. AnchorURL
	def AnchorURL(self):
	try:
	i,unsafe = 0,0
	for a in self.soup.find_all('a', href=True):
	if "#" in a['href'] or "javascript" in a['href'].lower() or "mailto" in a['href'].lower() or not (url in a['href'] or self.domain in a['href']):
	unsafe = unsafe + 1
	i = i + 1

	try:
	percentage = unsafe / float(i) * 100
	if percentage < 31.0:
	return 1
	elif ((percentage >= 31.0) and (percentage < 67.0)):
	return 0
	else:
	return -1
	except:
	return -1

	except:
	return -1

	# 15. LinksInScriptTags
	def LinksInScriptTags(self):
	try:
	i,success = 0,0

	for link in self.soup.find_all('link', href=True):
	dots = [x.start(0) for x in re.finditer('\.', link['href'])]
	if self.url in link['href'] or self.domain in link['href'] or len(dots) == 1:
	success = success + 1
	i = i+1

	for script in self.soup.find_all('script', src=True):
	dots = [x.start(0) for x in re.finditer('\.', script['src'])]
	if self.url in script['src'] or self.domain in script['src'] or len(dots) == 1:
	success = success + 1
	i = i+1

	try:
	percentage = success / float(i) * 100
	if percentage < 17.0:
	return 1
	elif((percentage >= 17.0) and (percentage < 81.0)):
	return 0
	else:
	return -1
	except:
	return 0
	except:
	return -1

	# 16. ServerFormHandler
	def ServerFormHandler(self):
	try:
	if len(self.soup.find_all('form', action=True))==0:
	return 1
	else :
	for form in self.soup.find_all('form', action=True):
	if form['action'] == "" or form['action'] == "about:blank":
	return -1
	elif self.url not in form['action'] and self.domain not in form['action']:
	return 0
	else:
	return 1
	except:
	return -1

	# 17. InfoEmail
	def InfoEmail(self):
	try:
	if re.findall(r"[mail\(\)\|mailto:?]", self.soap):
	return -1
	else:
	return 1
	except:
	return -1

	# 18. AbnormalURL
	def AbnormalURL(self):
	try:
	if self.response.text == self.whois_response:
	return 1
	else:
	return -1
	except:
	return -1

	# 19. WebsiteForwarding
	def WebsiteForwarding(self):
	try:
	if len(self.response.history) <= 1:
	return 1
	elif len(self.response.history) <= 4:
	return 0
	else:
	return -1
	except:
	return -1

	# 20. StatusBarCust
	def StatusBarCust(self):
	try:
	if re.findall("<script>.+onmouseover.+</script>", self.response.text):
	return 1
	else:
	return -1
	except:
	return -1

	# 21. DisableRightClick
	def DisableRightClick(self):
	try:
	if re.findall(r"event.button ?== ?2", self.response.text):
	return 1
	else:
	return -1
	except:
	return -1

	# 22. UsingPopupWindow
	def UsingPopupWindow(self):
	try:
	if re.findall(r"alert\(", self.response.text):
	return 1
	else:
	return -1
	except:
	return -1

	# 23. IframeRedirection
	def IframeRedirection(self):
	try:
	if re.findall(r"[<iframe>\|<frameBorder>]", self.response.text):
	return 1
	else:
	return -1
	except:
	return -1

	# 24. AgeofDomain
	def AgeofDomain(self):
	try:
	creation_date = self.whois_response.creation_date
	try:
	if(len(creation_date)):
	creation_date = creation_date[0]
	except:
	pass

	today = date.today()
	age = (today.year-creation_date.year)*12+(today.month-creation_date.month)
	if age >=6:
	return 1
	return -1
	except:
	return -1

	# 25. DNSRecording
	def DNSRecording(self):
	try:
	creation_date = self.whois_response.creation_date
	try:
	if(len(creation_date)):
	creation_date = creation_date[0]
	except:
	pass

	today = date.today()
	age = (today.year-creation_date.year)*12+(today.month-creation_date.month)
	if age >=6:
	return 1
	return -1
	except:
	return -1

	# 26. WebsiteTraffic
	def WebsiteTraffic(self):
	try:
	rank = BeautifulSoup(urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + url).read(), "xml").find("REACH")['RANK']
	if (int(rank) < 100000):
	return 1
	return 0
	except :
	return -1

	# 27. PageRank
	def PageRank(self):
	try:
	prank_checker_response = requests.post("https://www.checkpagerank.net/index.php", {"name": self.domain})

	global_rank = int(re.findall(r"Global Rank: ([0-9]+)", rank_checker_response.text)[0])
	if global_rank > 0 and global_rank < 100000:
	return 1
	return -1
	except:
	return -1


	# 28. GoogleIndex
	def GoogleIndex(self):
	try:
	site = search(self.url, 5)
	if site:
	return 1
	else:
	return -1
	except:
	return 1

	# 29. LinksPointingToPage
	def LinksPointingToPage(self):
	try:
	number_of_links = len(re.findall(r"<a href=", self.response.text))
	if number_of_links == 0:
	return 1
	elif number_of_links <= 2:
	return 0
	else:
	return -1
	except:
	return -1

	# 30. StatsReport
	def StatsReport(self):
	try:
	url_match = re.search(
	'at\.ua\|usa\.cc\|baltazarpresentes\.com\.br\|pe\.hu\|esy\.es\|hol\.es\|sweddy\.com\|myjino\.ru\|96\.lt\|ow\.ly', url)
	ip_address = socket.gethostbyname(self.domain)
	ip_match = re.search('146\.112\.61\.108\|213\.174\.157\.151\|121\.50\.168\.88\|192\.185\.217\.116\|78\.46\.211\.158\|181\.174\.165\.13\|46\.242\.145\.103\|121\.50\.168\.40\|83\.125\.22\.219\|46\.242\.145\.98\|'
	'107\.151\.148\.44\|107\.151\.148\.107\|64\.70\.19\.203\|199\.184\.144\.27\|107\.151\.148\.108\|107\.151\.148\.109\|119\.28\.52\.61\|54\.83\.43\.69\|52\.69\.166\.231\|216\.58\.192\.225\|'
	'118\.184\.25\.86\|67\.208\.74\.71\|23\.253\.126\.58\|104\.239\.157\.210\|175\.126\.123\.219\|141\.8\.224\.221\|10\.10\.10\.10\|43\.229\.108\.32\|103\.232\.215\.140\|69\.172\.201\.153\|'
	'216\.218\.185\.162\|54\.225\.104\.146\|103\.243\.24\.98\|199\.59\.243\.120\|31\.170\.160\.61\|213\.19\.128\.77\|62\.113\.226\.131\|208\.100\.26\.234\|195\.16\.127\.102\|195\.16\.127\.157\|'
	'34\.196\.13\.28\|103\.224\.212\.222\|172\.217\.4\.225\|54\.72\.9\.51\|192\.64\.147\.141\|198\.200\.56\.183\|23\.253\.164\.103\|52\.48\.191\.26\|52\.214\.197\.72\|87\.98\.255\.18\|209\.99\.17\.27\|'
	'216\.38\.62\.18\|104\.130\.124\.96\|47\.89\.58\.141\|78\.46\.211\.158\|54\.86\.225\.156\|54\.82\.156\.19\|37\.157\.192\.102\|204\.11\.56\.48\|110\.34\.231\.42', ip_address)
	if url_match:
	return -1
	elif ip_match:
	return -1
	return 1
	except:
	return 1

	def getFeaturesList(self):
	return self.features