# Phishing-URL feature extraction utilities.
# (Removed extraction artifacts: "Spaces:" / "Sleeping" residue lines.)
import urllib | |
import ipaddress | |
import re | |
import socket | |
from bs4 import BeautifulSoup | |
import whois | |
import requests | |
import urllib.request | |
from urllib.parse import urlparse | |
from datetime import datetime | |
def havingIP(url):
    """Return 1 when `url` is a bare IP address (IPv4 or IPv6), else 0.

    Fix: the original used a bare `except:`, which also swallows
    KeyboardInterrupt/SystemExit; `ipaddress.ip_address` raises ValueError
    for a non-address string, so catch exactly that.
    """
    try:
        ipaddress.ip_address(url)
        return 1
    except ValueError:
        return 0
def haveAtSign(url):
    """Return 1 when the URL contains an '@' character, else 0."""
    return 1 if "@" in url else 0
def getLength(url):
    """Return 1 for URLs shorter than 54 characters, otherwise 0."""
    return 1 if len(url) < 54 else 0
def getDepth(url):
    """Count the non-empty '/'-separated segments of the URL's path."""
    segments = urlparse(url).path.split('/')
    return sum(1 for segment in segments if segment)
def redirection(url):
    """Return 1 when '//' appears past the scheme separator, else 0.

    The '//' of 'https://' sits at index 6; anything later than index 7
    means the URL embeds a second '//' (a redirection trick).
    """
    return 1 if url.rfind('//') > 7 else 0
def httpDomain(url):
    """Return 1 when the literal token 'https' occurs inside the domain part
    of the URL (e.g. 'https-login.example.com'), else 0."""
    netloc = urlparse(url).netloc
    return 1 if 'https' in netloc else 0
# Regex alternation of known URL-shortening services; used by tinyURL().
# NOTE(review): several alternatives are duplicated (t.co, goo.gl, bit.ly,
# ow.ly, is.gd, tr.im, x.co) — harmless for matching, kept as-is.
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"
def tinyURL(url):
    """Return 1 when the URL uses a known link-shortening service, else 0."""
    return 1 if re.search(shortening_services, url) else 0
def prefixSuffix(url):
    """Return 1 when the domain contains a '-' (prefix/suffix trick), else 0."""
    return 1 if '-' in urlparse(url).netloc else 0
def web_traffic(url):
    """Return 1 when a Google `site:` search for `url` shows results, 0 when
    Google reports no match or any error occurs.

    Fix: the original `urlopen()` call had no timeout and could hang
    indefinitely on a stalled connection; a 10-second timeout is applied.
    """
    try:
        query = urllib.parse.quote(url)
        search_url = f"https://www.google.com/search?q=site:{query}"
        headers = {'User-Agent': 'Mozilla/5.0'}
        req = urllib.request.Request(search_url, headers=headers)
        response = urllib.request.urlopen(req, timeout=10).read()
        soup = BeautifulSoup(response, "lxml")
        # Google renders result text inside div.BNeawe blocks.
        results = soup.find_all('div', class_='BNeawe')
        for result in results:
            if 'did not match' in result.get_text():
                return 0
        return 1
    except Exception as e:
        # Best-effort feature: any failure counts as "no traffic".
        print(f"Error: {e}")
        return 0
def domainAge(domain_name):
    """Return 1 when the WHOIS registration span is under ~6 months
    (suspicious), else 0; 1 when the dates are missing/unusable.

    Fix: the original converted BOTH dates with strptime whenever EITHER was
    a str, so a (str, datetime) pair raised TypeError and wrongly returned 1
    even for perfectly valid data. Each date is now converted independently,
    and the bare except is narrowed to the parse errors strptime raises.
    """
    creation_date = domain_name.creation_date
    expiration_date = domain_name.expiration_date
    if isinstance(creation_date, str):
        try:
            creation_date = datetime.strptime(creation_date, '%Y-%m-%d')
        except (ValueError, TypeError):
            return 1
    if isinstance(expiration_date, str):
        try:
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except (ValueError, TypeError):
            return 1
    if (expiration_date is None) or (creation_date is None):
        return 1
    # WHOIS can return lists of dates; treat that as "unknown" like the original.
    if isinstance(expiration_date, list) or isinstance(creation_date, list):
        return 1
    ageofdomain = abs((expiration_date - creation_date).days)
    return 1 if (ageofdomain / 30) < 6 else 0
def domainEnd(domain_name):
    """Return 0 when the domain expires within ~6 months, 1 otherwise;
    1 when the expiration date is missing, a list, or unparseable.

    Fix: the bare `except:` around strptime is narrowed to ValueError, the
    error strptime raises for a malformed date string.

    NOTE(review): this feature returns 0 for "expires soon" — the inverse of
    the other features' 1-is-suspicious convention; preserved as-is.
    """
    expiration_date = domain_name.expiration_date
    if isinstance(expiration_date, str):
        try:
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except ValueError:
            return 1
    if expiration_date is None:
        return 1
    if isinstance(expiration_date, list):
        return 1
    today = datetime.now()
    remaining = abs((expiration_date - today).days)
    return 0 if (remaining / 30) < 6 else 1
def iframe(response):
    """Return 0 when the fetched page embeds an iframe, 1 otherwise
    (1 also when there is no response).

    Bug fix: the original pattern r"[<iframe>|<frameBorder>]" is a character
    CLASS — it matches any single one of those characters, so nearly every
    page "contained an iframe". Replaced with real alternation on the tag
    prefixes (no trailing '>' so attributes like `<iframe src=...>` match).
    """
    if response == "":
        return 1
    if re.findall(r"<iframe|<frameBorder", response.text):
        return 0
    return 1
def mouseOver(response):
    """Return 1 when an inline <script> block references onmouseover,
    0 otherwise; 1 when there is no response."""
    if response == "":
        return 1
    hits = re.findall("<script>.+onmouseover.+</script>", response.text)
    return 1 if hits else 0
def rightClick(response):
    """Return 0 when the page checks for right-clicks (event.button == 2),
    1 otherwise; 1 when there is no response."""
    if response == "":
        return 1
    return 0 if re.findall(r"event.button ?== ?2", response.text) else 1
def forwarding(response):
    """Return 1 when the request was redirected more than twice, 0 otherwise;
    1 when there is no response."""
    if response == "":
        return 1
    return 0 if len(response.history) <= 2 else 1
state = 0 | |
def featureExtraction(url):
    """Build the 16-element feature vector for `url`.

    Fetches the page (trying the URL as given, then with 'https://', then
    'http://'), runs the URL/DNS/content feature helpers, and returns the
    values in the order expected by `feature_names` (minus 'Domain'/'Label').

    Fixes: `requests.get` now carries a timeout (the original could hang
    indefinitely), and the WHOIS domain-name lookup handles both a plain
    string and a list — the original always indexed `[0]`, which picked the
    first *character* when WHOIS returned a string, making the DNS check
    always fail for those domains.
    """
    new_url = url
    try:
        response = requests.get(new_url, timeout=10)
    except Exception:
        try:
            new_url = 'https://' + url
            response = requests.get(new_url, timeout=10)
        except Exception:
            try:
                new_url = 'http://' + url
                response = requests.get(new_url, timeout=10)
            except Exception:
                response = ""
    url = new_url
    print("URL", url)

    # URL-string features.
    features = [
        havingIP(url),
        haveAtSign(url),
        getLength(url),
        getDepth(url),
        redirection(url),
        httpDomain(url),
        tinyURL(url),
        prefixSuffix(url),
    ]

    # Domain / DNS features.
    try:
        global state
        domain_name = whois.whois(urlparse(url).netloc)
        state = 0 if domain_name.get('domain_name') else 1
        registered = domain_name.domain_name
        if isinstance(registered, list):
            registered = registered[0]
        dns = 0 if socket.gethostbyname(registered) else 1
    except Exception:
        # WHOIS or DNS failure: flag as "no DNS record".
        dns = 1
    features.append(dns)
    features.append(web_traffic(url))
    features.append(1 if dns == 1 else domainAge(domain_name))
    features.append(1 if dns == 1 else domainEnd(domain_name))

    # Page-content features.
    features.append(iframe(response))
    features.append(mouseOver(response))
    features.append(rightClick(response))
    features.append(forwarding(response))
    return features
# Column names for the feature table. featureExtraction() itself returns 16
# values; 'Domain' and 'Label' are presumably prepended/appended by the
# caller when assembling the dataset — TODO confirm against that code.
feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth','Redirection',
                 'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record', 'Web_Traffic',
                 'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over','Right_Click', 'Web_Forwards', 'Label']
# Sample feature vectors produced by featureExtraction(), one row per URL.
# Column order: Have_IP, Have_At, URL_Length, URL_Depth, Redirection,
#   https_Domain, TinyURL, Prefix/Suffix, DNS_Record, Web_Traffic,
#   Domain_Age, Domain_End, iFrame, Mouse_Over, Right_Click, Web_Forwards
# e.g.:
#   0,0,1,3,0,0,0,0,0,1,0,1,0,0,1,0
#   0,0,1,1,0,0,0,0,0,1,0,1,0,0,1,0
#   0,0,0,1,0,0,0,0,1,0,1,1,0,0,1,0
#   0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1
#   0,0,1,0,0,0,0,0,0,1,1,1,0,0,1,0
#   0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0
#   0,0,1,3,0,0,0,0,0,0,1,1,0,0,1,0
#   0,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0
#   0,0,0,0,0,0,0,0,1,1,1,1,0,0,1,0