# Phishing-URL feature extraction utilities.
# (Removed extraction artifacts: "Spaces:" / "Sleeping" residue lines.)
import urllib | |
import ipaddress | |
import re | |
import socket | |
from bs4 import BeautifulSoup | |
import whois | |
import requests | |
import urllib.request | |
from urllib.parse import urlparse | |
from datetime import datetime | |
def havingIP(url):
    """Return 1 when `url` is a bare IP address (IPv4 or IPv6), else 0.

    Fix: the original used a bare `except:`, which also swallows
    KeyboardInterrupt/SystemExit; `ipaddress.ip_address` raises ValueError
    for a non-address string, so catch exactly that.
    """
    try:
        ipaddress.ip_address(url)
        return 1
    except ValueError:
        return 0
def haveAtSign(url):
    """Return 1 when the URL contains an '@' character, else 0."""
    return 1 if "@" in url else 0
def getLength(url):
    """Return 1 for URLs shorter than 54 characters, otherwise 0."""
    return 1 if len(url) < 54 else 0
def getDepth(url):
    """Count the non-empty '/'-separated segments of the URL's path."""
    segments = urlparse(url).path.split('/')
    return sum(1 for segment in segments if segment)
def redirection(url):
    """Return 1 when '//' appears past the scheme separator, else 0.

    The '//' of 'https://' sits at index 6; anything later than index 7
    means the URL embeds a second '//' (a redirection trick).
    """
    return 1 if url.rfind('//') > 7 else 0
def httpDomain(url):
    """Return 1 when the literal token 'https' occurs inside the domain part
    of the URL (e.g. 'https-login.example.com'), else 0."""
    netloc = urlparse(url).netloc
    return 1 if 'https' in netloc else 0
# Regex alternation of known URL-shortening services; used by tinyURL().
# NOTE(review): several alternatives are duplicated (t.co, goo.gl, bit.ly,
# ow.ly, is.gd, tr.im, x.co) — harmless for matching, kept as-is.
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"
def tinyURL(url):
    """Return 1 when the URL uses a known link-shortening service, else 0."""
    return 1 if re.search(shortening_services, url) else 0
def prefixSuffix(url):
    """Return 1 when the domain contains a '-' (prefix/suffix trick), else 0."""
    return 1 if '-' in urlparse(url).netloc else 0
def web_traffic(url):
    """Return 1 when a Google `site:` search for `url` shows results, 0 when
    Google reports no match or any error occurs.

    Fix: the original `urlopen()` call had no timeout and could hang
    indefinitely on a stalled connection; a 10-second timeout is applied.
    """
    try:
        query = urllib.parse.quote(url)
        search_url = f"https://www.google.com/search?q=site:{query}"
        headers = {'User-Agent': 'Mozilla/5.0'}
        req = urllib.request.Request(search_url, headers=headers)
        response = urllib.request.urlopen(req, timeout=10).read()
        soup = BeautifulSoup(response, "lxml")
        # Google renders result text inside div.BNeawe blocks.
        results = soup.find_all('div', class_='BNeawe')
        for result in results:
            if 'did not match' in result.get_text():
                return 0
        return 1
    except Exception as e:
        # Best-effort feature: any failure counts as "no traffic".
        print(f"Error: {e}")
        return 0
def domainAge(domain_name):
    """Return 1 when the WHOIS registration span is under ~6 months
    (suspicious), else 0; 1 when the dates are missing/unusable.

    Fix: the original converted BOTH dates with strptime whenever EITHER was
    a str, so a (str, datetime) pair raised TypeError and wrongly returned 1
    even for perfectly valid data. Each date is now converted independently,
    and the bare except is narrowed to the parse errors strptime raises.
    """
    creation_date = domain_name.creation_date
    expiration_date = domain_name.expiration_date
    if isinstance(creation_date, str):
        try:
            creation_date = datetime.strptime(creation_date, '%Y-%m-%d')
        except (ValueError, TypeError):
            return 1
    if isinstance(expiration_date, str):
        try:
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except (ValueError, TypeError):
            return 1
    if (expiration_date is None) or (creation_date is None):
        return 1
    # WHOIS can return lists of dates; treat that as "unknown" like the original.
    if isinstance(expiration_date, list) or isinstance(creation_date, list):
        return 1
    ageofdomain = abs((expiration_date - creation_date).days)
    return 1 if (ageofdomain / 30) < 6 else 0
def domainEnd(domain_name):
    """Return 0 when the domain expires within ~6 months, 1 otherwise;
    1 when the expiration date is missing, a list, or unparseable.

    Fix: the bare `except:` around strptime is narrowed to ValueError, the
    error strptime raises for a malformed date string.

    NOTE(review): this feature returns 0 for "expires soon" — the inverse of
    the other features' 1-is-suspicious convention; preserved as-is.
    """
    expiration_date = domain_name.expiration_date
    if isinstance(expiration_date, str):
        try:
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except ValueError:
            return 1
    if expiration_date is None:
        return 1
    if isinstance(expiration_date, list):
        return 1
    today = datetime.now()
    remaining = abs((expiration_date - today).days)
    return 0 if (remaining / 30) < 6 else 1
def iframe(response):
    """Return 0 when the fetched page embeds an iframe, 1 otherwise
    (1 also when there is no response).

    Bug fix: the original pattern r"[<iframe>|<frameBorder>]" is a character
    CLASS — it matches any single one of those characters, so nearly every
    page "contained an iframe". Replaced with real alternation on the tag
    prefixes (no trailing '>' so attributes like `<iframe src=...>` match).
    """
    if response == "":
        return 1
    if re.findall(r"<iframe|<frameBorder", response.text):
        return 0
    return 1
def mouseOver(response):
    """Return 1 when an inline <script> block references onmouseover,
    0 otherwise; 1 when there is no response."""
    if response == "":
        return 1
    hits = re.findall("<script>.+onmouseover.+</script>", response.text)
    return 1 if hits else 0
def rightClick(response):
    """Return 0 when the page checks for right-clicks (event.button == 2),
    1 otherwise; 1 when there is no response."""
    if response == "":
        return 1
    return 0 if re.findall(r"event.button ?== ?2", response.text) else 1
def forwarding(response):
    """Return 1 when the request was redirected more than twice, 0 otherwise;
    1 when there is no response."""
    if response == "":
        return 1
    return 0 if len(response.history) <= 2 else 1
state = 0 | |
def featureExtraction(url):
    """Build the 16-element feature vector for `url`.

    Fetches the page (trying the URL as given, then with 'https://', then
    'http://'), runs the URL/DNS/content feature helpers, and returns the
    values in the order expected by `feature_names` (minus 'Domain'/'Label').

    Fixes: `requests.get` now carries a timeout (the original could hang
    indefinitely), and the WHOIS domain-name lookup handles both a plain
    string and a list — the original always indexed `[0]`, which picked the
    first *character* when WHOIS returned a string, making the DNS check
    always fail for those domains.
    """
    new_url = url
    try:
        response = requests.get(new_url, timeout=10)
    except Exception:
        try:
            new_url = 'https://' + url
            response = requests.get(new_url, timeout=10)
        except Exception:
            try:
                new_url = 'http://' + url
                response = requests.get(new_url, timeout=10)
            except Exception:
                response = ""
    url = new_url
    print("URL", url)

    # URL-string features.
    features = [
        havingIP(url),
        haveAtSign(url),
        getLength(url),
        getDepth(url),
        redirection(url),
        httpDomain(url),
        tinyURL(url),
        prefixSuffix(url),
    ]

    # Domain / DNS features.
    try:
        global state
        domain_name = whois.whois(urlparse(url).netloc)
        state = 0 if domain_name.get('domain_name') else 1
        registered = domain_name.domain_name
        if isinstance(registered, list):
            registered = registered[0]
        dns = 0 if socket.gethostbyname(registered) else 1
    except Exception:
        # WHOIS or DNS failure: flag as "no DNS record".
        dns = 1
    features.append(dns)
    features.append(web_traffic(url))
    features.append(1 if dns == 1 else domainAge(domain_name))
    features.append(1 if dns == 1 else domainEnd(domain_name))

    # Page-content features.
    features.append(iframe(response))
    features.append(mouseOver(response))
    features.append(rightClick(response))
    features.append(forwarding(response))
    return features
# Column names for the feature table. featureExtraction() itself returns 16
# values; 'Domain' and 'Label' are presumably prepended/appended by the
# caller when assembling the dataset — TODO confirm against that code.
feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth','Redirection',
                 'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record', 'Web_Traffic',
                 'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over','Right_Click', 'Web_Forwards', 'Label']
# Sample feature vectors produced by featureExtraction(), one row per URL.
# Column order: Have_IP, Have_At, URL_Length, URL_Depth, Redirection,
#   https_Domain, TinyURL, Prefix/Suffix, DNS_Record, Web_Traffic,
#   Domain_Age, Domain_End, iFrame, Mouse_Over, Right_Click, Web_Forwards
# e.g.:
#   0,0,1,3,0,0,0,0,0,1,0,1,0,0,1,0
#   0,0,1,1,0,0,0,0,0,1,0,1,0,0,1,0
#   0,0,0,1,0,0,0,0,1,0,1,1,0,0,1,0
#   0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1
#   0,0,1,0,0,0,0,0,0,1,1,1,0,0,1,0
#   0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0
#   0,0,1,3,0,0,0,0,0,0,1,1,0,0,1,0
#   0,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0
#   0,0,0,0,0,0,0,0,1,1,1,1,0,0,1,0