Spaces:
Runtime error
Runtime error
File size: 3,334 Bytes
485f76b f1ab0d5 485f76b 1a24a58 ae7097b f1ab0d5 7c115c7 74a29fd cf048df 1a24a58 cf048df 1a24a58 485f76b 1a24a58 cf048df 1a24a58 cf048df 1a24a58 cf048df 485f76b cf048df 485f76b cf048df 485f76b f1ab0d5 485f76b dd9165a 1a24a58 dd9165a 1a24a58 dd9165a 1a24a58 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
#!/usr/bin/env python3
import concurrent.futures
import requests
from progress.bar import ChargingBar
from entity import Entity, read_entities
from common import defaults,mkdir
import web
# Default number of concurrent jobs; used as the default value of --parallel.
PARALLEL = 20
def do_screenshot(e: Entity):
    """Request a screenshot of the entity's website from the puppet service.

    Posts the entity's url and id, together with a screenshot path and a
    logos path (both named after ``e.bco``), to
    ``http://puppet:8000/screenshot``.

    Raises:
        ValueError: if the entity has no url.
        requests.HTTPError: if the service answers with an error status.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not e.url:
        raise ValueError(f'entity {e.id} has no url')

    response = requests.post('http://puppet:8000/screenshot', json={
        'url': e.url,
        'id': e.id,
        'path': f'{defaults.SCREENSHOT_PATH}/{e.bco}.png',
        'logos': f'{defaults.LOGOS_DATA_PATH}/{e.bco}.png',
    })
    # Surface 4xx/5xx answers so the caller can report the failed job
    # instead of silently treating it as a success.
    response.raise_for_status()
def get_entity_logo(e: Entity):
    """Pass the entity's logo reference to ``web.get_img_logo``.

    The destination handed over is ``<LOGOS_DATA_PATH>/<bco>.0.png``.
    """
    destination = "{}/{}.0.png".format(defaults.LOGOS_DATA_PATH, e.bco)
    web.get_img_logo(e.logo, destination)
def from_csv(args):
    """Run every enabled job for every entity listed in the CSV.

    ``args`` is the parsed command line; the attributes read here are
    ``certs``, ``logos``, ``screenshots``, ``entity_logo``, ``parallel``
    and ``csv``.

    Each enabled flag adds one job function and calls ``mkdir.make_dirs``
    on that job's output path. Every (entity, job) pair is then submitted
    to a thread pool. A job that raises is reported on stdout and does not
    stop the remaining jobs.
    """
    jobs = []
    if args.certs:
        jobs.append(web.get_cert)
        mkdir.make_dirs([defaults.CERTS_PATH])
    if args.logos:
        jobs.append(web.get_logos)
        mkdir.make_dirs([defaults.LOGOS_DATA_PATH])
    if args.screenshots:
        jobs.append(do_screenshot)
        mkdir.make_dirs([defaults.SCREENSHOT_PATH])
    if args.entity_logo:
        jobs.append(get_entity_logo)
        mkdir.make_dirs([defaults.LOGOS_DATA_PATH])
    print(jobs)

    with concurrent.futures.ThreadPoolExecutor(max_workers=args.parallel) as pool:
        entities = read_entities(args.csv)
        total = len(entities.keys()) * len(jobs)
        progress = ChargingBar(f'vendor ({total} jobs)', max=total)

        # Map each future back to its (entity, job) pair so a failure can be
        # attributed when it completes.
        pending = {}
        for entity in entities.values():
            for job in jobs:
                pending[pool.submit(job, entity)] = (entity, job)

        print('waiting for futures')
        for done in concurrent.futures.as_completed(pending):
            entity, job = pending[done]
            try:
                done.result()
            except Exception as err:
                print(f'{job}({entity.url}) generated an exception: {err}')
            # Advance once per finished job, whether it succeeded or failed.
            progress.next()
        progress.finish()
#query_vendor_site(Entity.from_dict({'url':'http://www.bancoprovincia.com.ar', 'bco':'debug'}))
#exit()
# Command-line entry point: parse the options and run the selected jobs.
if __name__ == '__main__':
    import argparse

    print("🌏 getting vendor data")
    parser = argparse.ArgumentParser(description='extract certificates and screenshots websites')
    parser.add_argument('--csv', metavar='csv', type=str,
                        default=defaults.MAIN_CSV_PATH,
                        help='main database')
    parser.add_argument('--parallel', metavar='parallel', type=int,
                        default=PARALLEL,
                        help='number of concurrent jobs')
    # The flags below are --name / --no-name switches that take no value, so
    # `type` and `metavar` had no effect on them. Passing those parameters to
    # BooleanOptionalAction is deprecated since Python 3.12 and rejected with
    # a TypeError from Python 3.14 on, so they are omitted.
    parser.add_argument('--logos',
                        action=argparse.BooleanOptionalAction,
                        default=True, help='try to get logos')
    parser.add_argument('--entity-logo',
                        action=argparse.BooleanOptionalAction,
                        default=True, help='try to get logos form ENTITY')
    parser.add_argument('--certs',
                        action=argparse.BooleanOptionalAction,
                        default=True, help='try to get certs')
    parser.add_argument('--screenshots',
                        action=argparse.BooleanOptionalAction,
                        default=True, help='try to get screenshots')
    args = parser.parse_args()
    from_csv(args)
|