Spaces:
Runtime error
Runtime error
Niv Sardi
committed on
Commit
•
7c115c7
1
Parent(s):
cab11aa
reorganize vendor.py and log
Browse files
Signed-off-by: Niv Sardi <xaiki@evilgiggle.com>
- crawler/vendor.py +21 -11
crawler/vendor.py
CHANGED
@@ -12,22 +12,20 @@ from entity import Entity
|
|
12 |
from common import selectors
|
13 |
import screenshot
|
14 |
|
15 |
-
def
|
16 |
-
pathlib.Path(f"./data/{e.bco}").mkdir(parents=True, exist_ok=True)
|
17 |
-
|
18 |
ssl_url = e.url.split("/")[2]
|
19 |
try:
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
|
|
|
|
24 |
|
|
|
|
|
25 |
logos = soup.select(selectors.logo)
|
26 |
-
cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)
|
27 |
|
28 |
-
fn = f"{e.DATA_PATH}/cert"
|
29 |
-
with open(fn, 'w') as f:
|
30 |
-
f.write(cert)
|
31 |
i = 0
|
32 |
lfn = []
|
33 |
for l in logos:
|
@@ -43,6 +41,18 @@ def query_vendor_site(e: Entity):
|
|
43 |
shutil.copyfileobj(res.raw, f)
|
44 |
lfn.append(fn)
|
45 |
i+=1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
screenshot.sc_entity(e)
|
47 |
return (fn, lfn)
|
48 |
|
|
|
12 |
from common import selectors
|
13 |
import screenshot
|
14 |
|
15 |
+
def write_cert(e: Entity):
|
|
|
|
|
16 |
ssl_url = e.url.split("/")[2]
|
17 |
try:
|
18 |
+
cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)
|
19 |
+
with open(f"{e.DATA_PATH}/cert", 'w') as f:
|
20 |
+
f.write(cert)
|
21 |
+
except Exception as err:
|
22 |
+
with open(f"{e.DATA_PATH}/error.log", 'w+') as f:
|
23 |
+
f.write(str(err))
|
24 |
|
25 |
+
def get_logos(e: Entity, page):
|
26 |
+
soup = BeautifulSoup(page.content, "html.parser")
|
27 |
logos = soup.select(selectors.logo)
|
|
|
28 |
|
|
|
|
|
|
|
29 |
i = 0
|
30 |
lfn = []
|
31 |
for l in logos:
|
|
|
41 |
shutil.copyfileobj(res.raw, f)
|
42 |
lfn.append(fn)
|
43 |
i+=1
|
44 |
+
|
45 |
+
def query_vendor_site(e: Entity):
|
46 |
+
pathlib.Path(f"./data/{e.bco}").mkdir(parents=True, exist_ok=True)
|
47 |
+
|
48 |
+
try:
|
49 |
+
page = requests.get(e.url)
|
50 |
+
except Exception:
|
51 |
+
e.url = e.url.replace('http', 'https')
|
52 |
+
page = requests.get(e.url)
|
53 |
+
|
54 |
+
write_cert(e)
|
55 |
+
get_logos(e, page)
|
56 |
screenshot.sc_entity(e)
|
57 |
return (fn, lfn)
|
58 |
|