# spoof-detect / python / get_entities.py
# Niv Sardi
# import python
# 1a24a58
#!/usr/bin/env python
import csv
import requests
import shutil
import re
from bs4 import BeautifulSoup
from progress.bar import ChargingBar
import concurrent.futures
import web
from entity import Entity
from common import selectors, defaults, mkdir
# Page that lists the financial entities. The same address is reused further
# down as the POST endpoint for each individual entity.
URL = 'http://www.bcra.gob.ar/SistemasFinancierosYdePagos/Entidades_financieras.asp'
# Download and parse the listing once, as a module-level side effect.
# NOTE(review): no timeout and no HTTP status check on this request — confirm
# that hanging or parsing an error page is acceptable for this script.
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
# Every <option> under the first element carrying the "form-control" class.
# Presumably that element is the entity <select>; if it is missing from the
# page this chain would fail rather than yield an empty list — TODO confirm.
options = soup.find(class_='form-control').find_all('option')
# Presumably creates the data and logo output directories if they do not
# exist yet (behaviour of mkdir.make_dirs is not visible here).
mkdir.make_dirs([defaults.DATA_PATH, defaults.LOGOS_DATA_PATH])
def get_links(soup):
    """Return the first absolute link inside the page's content sections.

    Walks every element matching ``.post-pagina-interior`` in order, and
    within each one every ``<a>`` element in order.  The ``href`` of the
    first anchor whose ``href`` starts with ``http`` is returned.

    Args:
        soup: parsed document (or any node) exposing ``select``.

    Returns:
        The matching ``href`` string, or ``None`` when no anchor qualifies.
    """
    # Lazily flatten "sections -> anchors" so the scan stops at the first hit.
    anchors = (
        anchor
        for section in soup.select('.post-pagina-interior')
        for anchor in section.select('a')
    )
    for anchor in anchors:
        if 'href' not in anchor.attrs:
            continue
        href = anchor.attrs['href']
        if href.startswith('http'):
            return href
    return None
# Only the options after the first one are scraped; the first is skipped
# (presumably a placeholder entry of the <select> — TODO confirm).
entries = options[1:]

# Write to a temporary file and move it over MAIN_CSV_PATH at the end, so the
# previous CSV is only replaced after the scrape loop has run.
with open(f'{defaults.MAIN_CSV_PATH}.tmp', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(Entity.row_names())

    # The bar advances once per submitted entry, so it is sized to the entries
    # actually scraped; sizing it to the full option list left it one short.
    bar = ChargingBar('get entities', max=len(entries))

    def get_bco(o, i):
        """Fetch one entity's page and return its CSV row.

        Args:
            o: an ``<option>`` element; its text is used as the entity name
                and its ``value`` attribute as the ``bco`` code posted to
                ``URL``.
            i: id given to the resulting ``Entity``.

        Returns:
            Whatever ``Entity.to_row()`` produces for the scraped entity.

        Raises:
            Any exception raised by the HTTP request or the parsing; the
            caller collects it through the future.

        This runs on a worker thread, so it does not touch the shared CSV
        writer: the row is handed back and written by the main thread.
        """
        name, bco = o.text, o.attrs['value']
        # NOTE(review): no timeout is set, so a stalled connection blocks this
        # worker indefinitely — consider passing ``timeout=`` once an
        # acceptable value for this site is known.
        response = requests.post(URL, data={'bco': bco}, stream=False)
        detail = BeautifulSoup(response.content, 'html.parser')
        img = f'https://www.bcra.gob.ar/Imagenes/logosbancos/{bco}.jpg'
        # get_links() may return None; str() stores that as the text 'None'.
        e = Entity(name, id=i, bco=bco, logo=str(img), url=str(get_links(detail)))
        return e.to_row()

    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        # Map each future back to its <option> for error reporting.
        futures = {executor.submit(get_bco, o, i): o for (i, o) in enumerate(entries)}
        for f in concurrent.futures.as_completed(futures):
            o = futures[f]
            try:
                row = f.result()
            except Exception as err:
                # Best effort: report the failed entity and keep scraping.
                print(f'({o}) generated an exception: {err}')
            else:
                # Rows are written from this thread only; the text file object
                # behind the csv writer is not safe to share across workers.
                writer.writerow(row)
            bar.next()
        bar.finish()

shutil.move(f'{defaults.MAIN_CSV_PATH}.tmp', defaults.MAIN_CSV_PATH)
print(f'scrape finished, found {len(entries)} entities, dumped to {defaults.MAIN_CSV_PATH}')