Niv Sardi committed on
Commit
7c115c7
1 Parent(s): cab11aa

reorganize vendor.py and log

Browse files

Signed-off-by: Niv Sardi <xaiki@evilgiggle.com>

Files changed (1) hide show
  1. crawler/vendor.py +21 -11
crawler/vendor.py CHANGED
@@ -12,22 +12,20 @@ from entity import Entity
12
  from common import selectors
13
  import screenshot
14
 
15
- def query_vendor_site(e: Entity):
16
- pathlib.Path(f"./data/{e.bco}").mkdir(parents=True, exist_ok=True)
17
-
18
  ssl_url = e.url.split("/")[2]
19
  try:
20
- page = requests.get(e.url)
21
- except Exception:
22
- page = requests.get(e.url.replace('http', 'https'))
23
- soup = BeautifulSoup(page.content, "html.parser")
 
 
24
 
 
 
25
  logos = soup.select(selectors.logo)
26
- cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)
27
 
28
- fn = f"{e.DATA_PATH}/cert"
29
- with open(fn, 'w') as f:
30
- f.write(cert)
31
  i = 0
32
  lfn = []
33
  for l in logos:
@@ -43,6 +41,18 @@ def query_vendor_site(e: Entity):
43
  shutil.copyfileobj(res.raw, f)
44
  lfn.append(fn)
45
  i+=1
 
 
 
 
 
 
 
 
 
 
 
 
46
  screenshot.sc_entity(e)
47
  return (fn, lfn)
48
 
 
12
  from common import selectors
13
  import screenshot
14
 
15
+ def write_cert(e: Entity):
 
 
16
  ssl_url = e.url.split("/")[2]
17
  try:
18
+ cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)
19
+ with open(f"{e.DATA_PATH}/cert", 'w') as f:
20
+ f.write(cert)
21
+ except Exception as err:
22
+ with open(f"{e.DATA_PATH}/error.log", 'w+') as f:
23
+ f.write(str(err))
24
 
25
+ def get_logos(e: Entity, page):
26
+ soup = BeautifulSoup(page.content, "html.parser")
27
  logos = soup.select(selectors.logo)
 
28
 
 
 
 
29
  i = 0
30
  lfn = []
31
  for l in logos:
 
41
  shutil.copyfileobj(res.raw, f)
42
  lfn.append(fn)
43
  i+=1
44
+
45
+ def query_vendor_site(e: Entity):
46
+ pathlib.Path(f"./data/{e.bco}").mkdir(parents=True, exist_ok=True)
47
+
48
+ try:
49
+ page = requests.get(e.url)
50
+ except Exception:
51
+ e.url = e.url.replace('http', 'https')
52
+ page = requests.get(e.url)
53
+
54
+ write_cert(e)
55
+ get_logos(e, page)
56
  screenshot.sc_entity(e)
57
  return (fn, lfn)
58