Niv Sardi committed on
Commit
ae7097b
1 Parent(s): 4b890a6

defaults defaults defaults (and types)

Browse files

Signed-off-by: Niv Sardi <xaiki@evilgiggle.com>

python/common/defaults.py CHANGED
@@ -1 +1,16 @@
1
  DATA_PATH='./data'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  DATA_PATH='./data'
2
+
3
+ LABELS_PATH=f'{DATA_PATH}/labels'
4
+ IMAGES_PATH=f'{DATA_PATH}/images'
5
+ CERTS_PATH=f'{DATA_PATH}/certs'
6
+
7
+ SQUARES_DATA_PATH=f'{DATA_PATH}/squares'
8
+ SQUARES_LABELS_PATH=f'{SQUARES_DATA_PATH}/labels'
9
+ SQUARES_IMAGES_PATH=f'{SQUARES_DATA_PATH}/images'
10
+
11
+ DEBUG_PATH=f'{DATA_PATH}/debug'
12
+ DEBUG_SQUARES_PATH=f'{DEBUG_PATH}/squares'
13
+
14
+ LOGOS_DATA_PATH=f'{DATA_PATH}/logos'
15
+
16
+ MAIN_CSV_PATH=f'{DATA_PATH}/entities.csv'
python/common/mkdir.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import pathlib
2
+
3
+ def make_dirs(dirs: [str]):
4
+ for p in dirs:
5
+ pathlib.Path(p).mkdir(parents=True, exist_ok=True)
6
+
python/entity.py CHANGED
@@ -2,8 +2,10 @@
2
  import csv
3
  from typing import NamedTuple
4
 
 
 
5
  def read_entities(fn):
6
- with open('./data/entidades.csv', newline='') as csvfile:
7
  reader = csv.DictReader(csvfile)
8
  bcos = { d['bco']:update(d, {'id': i}) for i, d in enumerate(reader)}
9
  return bcos
 
2
  import csv
3
  from typing import NamedTuple
4
 
5
+ from common import defaults
6
+
7
  def read_entities(fn):
8
+ with open(defaults.MAIN_CSV_PATH, newline='') as csvfile:
9
  reader = csv.DictReader(csvfile)
10
  bcos = { d['bco']:update(d, {'id': i}) for i, d in enumerate(reader)}
11
  return bcos
python/imtool.py CHANGED
@@ -3,10 +3,10 @@
3
  import os
4
  import math
5
  import cv2
6
- import pathlib
7
  from typing import NamedTuple
8
 
9
  from entity import Entity
 
10
 
11
  TILE_SIZE = 416
12
  TILE_OVERLAP = 0.8
@@ -69,9 +69,7 @@ def crop(id, fn, logos):
69
  img_out = f"./data/squares/images"
70
  txt_out = f"./data/squares/labels"
71
  debug_out = f"./data/debug"
72
- pathlib.Path(debug_out).mkdir(parents=True, exist_ok=True)
73
- pathlib.Path(img_out).mkdir(parents=True, exist_ok=True)
74
- pathlib.Path(txt_out).mkdir(parents=True, exist_ok=True)
75
 
76
  im = cv2.imread(fn)
77
  rim = cv2.imread(fn)
 
3
  import os
4
  import math
5
  import cv2
 
6
  from typing import NamedTuple
7
 
8
  from entity import Entity
9
+ from common import mkdir
10
 
11
  TILE_SIZE = 416
12
  TILE_OVERLAP = 0.8
 
69
  img_out = f"./data/squares/images"
70
  txt_out = f"./data/squares/labels"
71
  debug_out = f"./data/debug"
72
+ mkdir.make_dirs([debug_out, img_out, txt_out])
 
 
73
 
74
  im = cv2.imread(fn)
75
  rim = cv2.imread(fn)
python/main.py CHANGED
@@ -1,5 +1,4 @@
1
  import csv
2
- import pathlib
3
  import requests
4
  import shutil
5
 
@@ -7,18 +6,16 @@ from bs4 import BeautifulSoup
7
  from progress.bar import ChargingBar
8
 
9
  from entity import Entity
10
- from common import selectors
11
- from common import defaults
12
 
13
- pathlib.Path(f'{defaults.DATA_PATH}/logos').mkdir(parents=True, exist_ok=True)
14
-
15
- DATA_FILE = './data/entidades.csv'
16
  URL = 'http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp'
17
  page = requests.get(URL)
18
  soup = BeautifulSoup(page.content, 'html.parser')
19
 
20
  options = soup.find(class_='form-control').find_all('option')
21
- with open(f'{DATA_FILE}.tmp', 'w', newline='') as csvfile:
 
 
22
  writer = csv.writer(csvfile)
23
  writer.writerow(Entity.row_names())
24
 
@@ -46,11 +43,11 @@ with open(f'{DATA_FILE}.tmp', 'w', newline='') as csvfile:
46
  except TypeError:
47
  print('ERROR', a)
48
 
49
- e = Entity(name, id=i, bco=bco, logo=img, url=a)
50
  writer.writerow(e.to_row())
51
  i+=1
52
  bar.next()
53
  bar.finish()
54
 
55
- shutil.move(f'{DATA_FILE}.tmp', DATA_FILE)
56
  print('scrape finished')
 
1
  import csv
 
2
  import requests
3
  import shutil
4
 
 
6
  from progress.bar import ChargingBar
7
 
8
  from entity import Entity
9
+ from common import selectors, defaults, mkdir
 
10
 
 
 
 
11
  URL = 'http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp'
12
  page = requests.get(URL)
13
  soup = BeautifulSoup(page.content, 'html.parser')
14
 
15
  options = soup.find(class_='form-control').find_all('option')
16
+ mkdir.make_dirs([defaults.DATA_PATH])
17
+
18
+ with open(f'{defaults.MAIN_CSV_PATH}.tmp', 'w', newline='') as csvfile:
19
  writer = csv.writer(csvfile)
20
  writer.writerow(Entity.row_names())
21
 
 
43
  except TypeError:
44
  print('ERROR', a)
45
 
46
+ e = Entity(name, id=i, bco=bco, logo=str(img), url=str(a))
47
  writer.writerow(e.to_row())
48
  i+=1
49
  bar.next()
50
  bar.finish()
51
 
52
+ shutil.move(f'{defaults.MAIN_CSV_PATH}.tmp', defaults.MAIN_CSV_PATH)
53
  print('scrape finished')
python/screenshot.py CHANGED
@@ -9,7 +9,7 @@ from selenium.webdriver.common.by import By
9
 
10
  from common import selectors
11
  from entity import Entity
12
- from common import defaults
13
 
14
  options = webdriver.FirefoxOptions()
15
  options.add_argument("--headless")
@@ -22,18 +22,23 @@ def coord_to_point(c):
22
 
23
  driver = webdriver.Firefox(options=options)
24
  def sc_entity(e: Entity):
25
- print(e)
 
 
 
 
 
26
  driver.implicitly_wait(10)
27
  driver.get(e.url)
28
- driver.save_screenshot(f"{defaults.DATA_PATH}/{e.bco}.png")
29
- driver.save_full_page_screenshot(f"{defaults.DATA_PATH}/{e.bco}.full.png")
30
 
31
  logos = driver.find_elements(By.CSS_SELECTOR, selectors.img_logo) or []
32
  logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.id_logo) or [])
33
  logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.cls_logo) or [])
34
- with open(f"{defaults.DATA_PATH}/{e.bco}.full.txt", 'w') as f:
35
  for i in logos:
36
- f.write(f"{e.bco} {coord_to_point(i.rect)}\n")
37
 
38
  if __name__ == '__main__':
39
  sc_entity(Entity.from_dict({'url': 'http://www.bbva.com.ar', 'bco': 'debug'}))
 
9
 
10
  from common import selectors
11
  from entity import Entity
12
+ from common import defaults,mkdir
13
 
14
  options = webdriver.FirefoxOptions()
15
  options.add_argument("--headless")
 
22
 
23
  driver = webdriver.Firefox(options=options)
24
  def sc_entity(e: Entity):
25
+ print(f'screenshoting: {e}')
26
+ mkdir.make_dirs([
27
+ defaults.IMAGES_PATH,
28
+ defaults.LABELS_PATH,
29
+ ])
30
+
31
  driver.implicitly_wait(10)
32
  driver.get(e.url)
33
+ #driver.save_screenshot(f"{defaults.DATA_PATH}/{e.bco}.png")
34
+ driver.save_full_page_screenshot(f"{defaults.IMAGES_PATH}/{e.bco}.full.png")
35
 
36
  logos = driver.find_elements(By.CSS_SELECTOR, selectors.img_logo) or []
37
  logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.id_logo) or [])
38
  logos.extend(driver.find_elements(By.CSS_SELECTOR, selectors.cls_logo) or [])
39
+ with open(f"{defaults.LABELS_PATH}/{e.bco}.full.txt", 'w') as f:
40
  for i in logos:
41
+ f.write(f"{e.id} {coord_to_point(i.rect)}\n")
42
 
43
  if __name__ == '__main__':
44
  sc_entity(Entity.from_dict({'url': 'http://www.bbva.com.ar', 'bco': 'debug'}))
python/vendor.py CHANGED
@@ -1,5 +1,4 @@
1
  #!/usr/bin/env python3
2
- import pathlib
3
  import csv
4
  import concurrent.futures
5
  import requests
@@ -7,7 +6,7 @@ import requests
7
  from progress.bar import ChargingBar
8
 
9
  from entity import Entity
10
- from common import defaults
11
  import screenshot
12
  import web
13
 
@@ -40,6 +39,4 @@ def from_csv(fn):
40
  #exit()
41
 
42
  if __name__ == '__main__':
43
- #pathlib.Path(defaults.DATA_PATH).mkdir(parents=True, exist_ok=True)
44
- pathlib.Path(f"{defaults.DATA_PATH}/logos").mkdir(parents=True, exist_ok=True)
45
- from_csv(f"{defaults.DATA_PATH}/entidades.csv")
 
1
  #!/usr/bin/env python3
 
2
  import csv
3
  import concurrent.futures
4
  import requests
 
6
  from progress.bar import ChargingBar
7
 
8
  from entity import Entity
9
+ from common import defaults,mkdir
10
  import screenshot
11
  import web
12
 
 
39
  #exit()
40
 
41
  if __name__ == '__main__':
42
+ from_csv(defaults.MAIN_CSV_PATH)
 
 
python/web.py CHANGED
@@ -5,7 +5,7 @@ import requests
5
  from bs4 import BeautifulSoup
6
 
7
  from entity import Entity
8
- from common import selectors, defaults
9
 
10
  def get_page(e: Entity):
11
  try:
@@ -17,9 +17,10 @@ def get_page(e: Entity):
17
 
18
  def get_cert(e: Entity):
19
  ssl_url = e.url.split("/")[2]
 
20
  try:
21
  cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)
22
- fn = f"{defaults.DATA_PATH}/{e.bco}.cert"
23
  with open(fn, 'w') as f:
24
  f.write(cert)
25
  except Exception as err:
@@ -39,6 +40,8 @@ def get_logos(e: Entity, page):
39
  logos.extend(soup.select(selectors.id_logo))
40
  logos.extend(soup.select(selectors.cls_logo))
41
 
 
 
42
  i = 0
43
  lfn = []
44
  for l in logos:
@@ -46,7 +49,7 @@ def get_logos(e: Entity, page):
46
  src = l.attrs['src']
47
  ext = src.split('.')[-1].split('/')[-1]
48
  if not src.startswith('http'): src = e.url + src
49
- fn = f"{defaults.DATA_PATH}/logos/{e.bco}.{i}.{ext}"
50
  lfn.append(get_img_logo(src, fn))
51
  i+=1
52
  return lfn
 
5
  from bs4 import BeautifulSoup
6
 
7
  from entity import Entity
8
+ from common import selectors, defaults, mkdir
9
 
10
  def get_page(e: Entity):
11
  try:
 
17
 
18
  def get_cert(e: Entity):
19
  ssl_url = e.url.split("/")[2]
20
+ mkdir.make_dirs([defaults.CERTS_PATH])
21
  try:
22
  cert = ssl.get_server_certificate((ssl_url, 443), ca_certs=None)
23
+ fn = f"{defaults.CERTS_PATH}/{e.bco}.cert"
24
  with open(fn, 'w') as f:
25
  f.write(cert)
26
  except Exception as err:
 
40
  logos.extend(soup.select(selectors.id_logo))
41
  logos.extend(soup.select(selectors.cls_logo))
42
 
43
+ mkdir.make_dirs([defaults.LOGOS_DATA_PATH])
44
+
45
  i = 0
46
  lfn = []
47
  for l in logos:
 
49
  src = l.attrs['src']
50
  ext = src.split('.')[-1].split('/')[-1]
51
  if not src.startswith('http'): src = e.url + src
52
+ fn = f"{defaults.LOGOS_DATA_PATH}/{e.bco}.{i}.{ext}"
53
  lfn.append(get_img_logo(src, fn))
54
  i+=1
55
  return lfn