Spaces:
Sleeping
Sleeping
akshatsanghvi
commited on
Commit
•
837d4e1
1
Parent(s):
feced93
Update file
Browse files- URLFeatureExtraction.py +260 -0
URLFeatureExtraction.py
ADDED
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import urllib
|
2 |
+
import ipaddress
|
3 |
+
import re
|
4 |
+
import socket
|
5 |
+
from bs4 import BeautifulSoup
|
6 |
+
import whois
|
7 |
+
import requests
|
8 |
+
import urllib.request
|
9 |
+
from urllib.parse import urlparse
|
10 |
+
from datetime import datetime
|
11 |
+
|
12 |
+
def havingIP(url):
    """Return 1 when *url* is, or is hosted on, a literal IP address; else 0.

    The raw string is checked first, so a bare address such as
    ``"10.0.0.1"`` is still accepted. The host component parsed out of
    the URL is checked as well, because a full URL like
    ``"http://10.0.0.1/login"`` is never itself a valid IP address.
    """
    candidates = [url]
    try:
        host = urlparse(url).hostname
    except (ValueError, TypeError, AttributeError):
        # Malformed or non-string input: only the raw value can be checked.
        host = None
    if host:
        candidates.append(host)

    for candidate in candidates:
        try:
            ipaddress.ip_address(candidate)
        except ValueError:
            continue
        return 1
    return 0
|
19 |
+
|
20 |
+
def haveAtSign(url):
    """Return 1 if *url* contains an ``@`` character, otherwise 0."""
    return 1 if "@" in url else 0
|
26 |
+
|
27 |
+
|
28 |
+
def getLength(url):
    """Return 1 when *url* is shorter than 54 characters, otherwise 0."""
    is_short = len(url) < 54
    return int(is_short)
|
33 |
+
|
34 |
+
def getDepth(url):
    """Return the number of non-empty ``/``-separated segments in the URL path."""
    segments = urlparse(url).path.split('/')
    return sum(1 for segment in segments if segment)
|
41 |
+
|
42 |
+
def redirection(url):
    """Return 1 if the last ``//`` in *url* starts after index 7, else 0.

    A ``//`` at index 7 or lower (or no ``//`` at all) yields 0.
    """
    last_double_slash = url.rfind('//')
    return 1 if last_double_slash > 7 else 0
|
51 |
+
|
52 |
+
def httpDomain(url):
    """Return 1 if the text ``https`` occurs in the URL's network location, else 0.

    Only the netloc part is inspected; the URL scheme is not.
    """
    netloc = urlparse(url).netloc
    return int('https' in netloc)
|
58 |
+
|
59 |
+
# Alternation of known URL-shortening service domains (regex source text).
shortening_services = (
    r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|"
    r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|"
    r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|"
    r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|"
    r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|"
    r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|"
    r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|"
    r"tr\.im|link\.zip\.net"
)

# A service must be the whole host or a parent domain of it. Searching the
# bare alternation anywhere in the URL makes "t\.co" hit "microsoft.com"
# and "x\.co" hit "netflix.com". Case-insensitive because host names are,
# and the list contains mixed-case entries such as "BudURL\.com".
_SHORTENER_HOST_RE = re.compile(
    r"(?:^|\.)(?:" + shortening_services + r")$", re.IGNORECASE
)


def tinyURL(url):
    """Return 1 if the host of *url* belongs to a known shortening service, else 0.

    Only the host is examined; a shortener name appearing in the path or
    query string does not count. URLs without a scheme are supported by
    treating everything before the first ``/``, ``?`` or ``#`` as the host.
    """
    try:
        host = urlparse(url).hostname
    except ValueError:
        host = None
    if not host:
        # No scheme (e.g. "bit.ly/abc"): take the leading authority-like
        # part and drop any userinfo and port.
        authority = re.split(r"[/?#]", url, maxsplit=1)[0]
        host = authority.rsplit("@", 1)[-1].split(":", 1)[0]
    return 1 if _SHORTENER_HOST_RE.search(host) else 0
|
74 |
+
|
75 |
+
|
76 |
+
def prefixSuffix(url):
    """Return 1 if the URL's network location contains a hyphen, else 0."""
    netloc = urlparse(url).netloc
    return int('-' in netloc)
|
81 |
+
|
82 |
+
def web_traffic(url):
    """Return 1 if a Google ``site:`` search for *url* appears to have results, else 0.

    The search results page is fetched and every ``div`` with class
    ``BNeawe`` is scanned for the text ``did not match``; finding it
    yields 0. Any failure (network error, timeout, parse error) is
    printed and also yields 0.

    NOTE(review): the ``BNeawe`` class name depends on Google's current
    markup; confirm it still identifies the "no results" message.
    """
    try:
        query = urllib.parse.quote(url)
        search_url = f"https://www.google.com/search?q=site:{query}"

        headers = {'User-Agent': 'Mozilla/5.0'}
        request = urllib.request.Request(search_url, headers=headers)
        # Bounded wait and deterministic close: without a timeout a stalled
        # connection would block feature extraction indefinitely.
        with urllib.request.urlopen(request, timeout=10) as reply:
            html = reply.read()

        soup = BeautifulSoup(html, "lxml")
        for result in soup.find_all('div', class_='BNeawe'):
            if 'did not match' in result.get_text():
                return 0

        return 1

    except Exception as e:
        print(f"Error: {e}")
        return 0
|
103 |
+
|
104 |
+
def domainAge(domain_name):
    """Return 1 if the registration span is under six 30-day months or unknown, else 0.

    *domain_name* must expose ``creation_date`` and ``expiration_date``
    attributes. The span is the absolute number of days between the two.
    The result is 1 when either date is missing, is a list, cannot be
    parsed from a ``YYYY-MM-DD`` string, or when the two values cannot be
    subtracted (for example one is timezone-aware and the other naive).
    """
    creation_date = domain_name.creation_date
    expiration_date = domain_name.expiration_date

    if isinstance(creation_date, str) or isinstance(expiration_date, str):
        try:
            creation_date = datetime.strptime(creation_date, '%Y-%m-%d')
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except (TypeError, ValueError):
            # TypeError: only one of the two was a string.
            # ValueError: the string is not in YYYY-MM-DD form.
            return 1

    if creation_date is None or expiration_date is None:
        return 1
    if isinstance(creation_date, list) or isinstance(expiration_date, list):
        return 1

    try:
        span_days = abs((expiration_date - creation_date).days)
    except TypeError:
        # Aware/naive mix or non-date values: treat as unknown.
        return 1

    return 1 if (span_days / 30) < 6 else 0
|
124 |
+
|
125 |
+
def domainEnd(domain_name):
    """Return 0 if the expiration date is within six 30-day months of now, else 1.

    *domain_name* must expose an ``expiration_date`` attribute. The
    distance is the absolute number of days between that date and the
    current time. The result is 1 when the date is missing, is a list,
    cannot be parsed from a ``YYYY-MM-DD`` string, or is not a value that
    can be compared with the current time.
    """
    expiration_date = domain_name.expiration_date

    if isinstance(expiration_date, str):
        try:
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except ValueError:
            return 1

    if expiration_date is None or isinstance(expiration_date, list):
        return 1

    try:
        # Take "now" in the expiration date's own timezone (None keeps it
        # naive); subtracting a naive and an aware datetime raises TypeError.
        today = datetime.now(tz=expiration_date.tzinfo)
        days_away = abs((expiration_date - today).days)
    except (AttributeError, TypeError):
        return 1

    return 0 if (days_away / 30) < 6 else 1
|
144 |
+
|
145 |
+
def iframe(response):
    """Return 0 if the page markup contains an iframe or frameborder, else 1.

    *response* is either the empty string (no page could be fetched,
    which yields 1) or an object with a ``text`` attribute holding the
    page source.

    The previous pattern ``[<iframe>|<frameBorder>]`` was a character
    class, so it matched any page containing a single ``<``, ``i``,
    ``e`` and so on. The 0/1 mapping itself is unchanged.
    NOTE(review): features computed with the old pattern were 0 for
    virtually every fetched page; confirm downstream consumers expect
    the corrected values.
    """
    if response == "":
        return 1
    if re.search(r"<iframe\b|\bframeborder\b", response.text, re.IGNORECASE):
        return 0
    return 1
|
153 |
+
|
154 |
+
def mouseOver(response):
    """Return 1 if no page was fetched or a script mentions ``onmouseover``, else 0.

    *response* is either the empty string or an object whose ``text``
    attribute holds the page source.
    """
    if response == "":
        return 1
    matches = re.findall("<script>.+onmouseover.+</script>", response.text)
    return 1 if matches else 0
|
162 |
+
|
163 |
+
def rightClick(response):
    """Return 0 if the page source tests ``event.button == 2``, else 1.

    *response* is either the empty string (which yields 1) or an object
    whose ``text`` attribute holds the page source.
    """
    if response == "":
        return 1
    found = re.findall(r"event.button ?== ?2", response.text)
    return 0 if found else 1
|
171 |
+
|
172 |
+
def forwarding(response):
    """Return 1 if no page was fetched or its history has more than two entries, else 0.

    *response* is either the empty string or an object with a
    ``history`` sequence.
    """
    if response == "":
        return 1
    hop_count = len(response.history)
    return 1 if hop_count > 2 else 0
|
180 |
+
|
181 |
+
# Set by featureExtraction: 0 when the last WHOIS lookup returned a
# domain name, 1 when it did not.
state = 0


def _fetch_response(url, timeout=10):
    """Fetch *url* as given, then with ``https://`` and ``http://`` prefixes.

    Returns ``(response, attempted_url)``. ``response`` is the empty
    string when every attempt failed, and ``attempted_url`` is then the
    last URL that was tried.
    """
    attempted = url
    for prefix in ('', 'https://', 'http://'):
        try:
            attempted = prefix + url
            # A timeout keeps an unresponsive host from blocking extraction.
            response = requests.get(attempted, timeout=timeout)
        except Exception:
            continue
        return response, attempted
    return "", attempted


def featureExtraction(url):
    """Return the list of 16 numeric features computed for *url*.

    The features are, in order: IP host, ``@`` sign, length, path depth,
    redirection, ``https`` in domain, shortener, hyphen in domain, DNS
    record, web traffic, domain age, domain end, iframe, mouse-over,
    right-click and forwarding. The URL actually fetched (possibly with
    a scheme prepended) is printed and used for all URL-based features.

    Side effect: updates the module-level ``state`` flag.
    """
    global state

    response, url = _fetch_response(url)
    print("URL", url)

    features = [
        havingIP(url),
        haveAtSign(url),
        getLength(url),
        getDepth(url),
        redirection(url),
        httpDomain(url),
        tinyURL(url),
        prefixSuffix(url),
    ]

    domain_name = None
    try:
        domain_name = whois.whois(urlparse(url).netloc)
        registered = domain_name.get('domain_name')
        state = 0 if registered else 1

        # WHOIS may give one name or a list of names. Indexing a plain
        # string with [0] would pass a single character to the resolver.
        if isinstance(registered, (list, tuple)):
            registered = registered[0] if registered else None
        dns = 0 if registered and socket.gethostbyname(registered) else 1
    except Exception:
        dns = 1

    features.append(dns)
    features.append(web_traffic(url))
    # Without a DNS record the WHOIS data is unusable; report both
    # domain-date features as 1.
    features.append(1 if dns == 1 else domainAge(domain_name))
    features.append(1 if dns == 1 else domainEnd(domain_name))

    features.append(iframe(response))
    features.append(mouseOver(response))
    features.append(rightClick(response))
    features.append(forwarding(response))

    return features
|
239 |
+
|
240 |
+
# Column names for the feature table. featureExtraction in this file returns
# 16 values matching 'Have_IP' through 'Web_Forwards'; 'Domain' and 'Label'
# are not produced by it.
feature_names = [
    'Domain',
    'Have_IP',
    'Have_At',
    'URL_Length',
    'URL_Depth',
    'Redirection',
    'https_Domain',
    'TinyURL',
    'Prefix/Suffix',
    'DNS_Record',
    'Web_Traffic',
    'Domain_Age',
    'Domain_End',
    'iFrame',
    'Mouse_Over',
    'Right_Click',
    'Web_Forwards',
    'Label',
]
|
243 |
+
|
244 |
+
# I @ L D R D t P D T A E i M R F L
|
245 |
+
# . . . . . .
|
246 |
+
|
247 |
+
# 0,0,1,3,0,0,0,0,0,1,0,1,0,0,1,0 0
|
248 |
+
# 0,0,1,1,0,0,0,0,0,1,0,1,0,0,1,0 Y
|
249 |
+
# 0,0,0,1,0,0,0,0,1,0,1,1,0,0,1,0 -
|
250 |
+
|
251 |
+
# . .
|
252 |
+
# 0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1 0
|
253 |
+
# 0,0,1,0,0,0,0,0,0,1,1,1,0,0,1,0
|
254 |
+
# 0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0 -
|
255 |
+
|
256 |
+
# 0,0,1,3,0,0,0,0,0,0,1,1,0,0,1,0 1
|
257 |
+
# 0,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0
|
258 |
+
# 0,0,0,0,0,0,0,0,1,1,1,1,0,0,1,0 -
|
259 |
+
|
260 |
+
# Prints : site. history. array. pred.
|