import os
import warnings
from typing import Iterator, Optional, Tuple, Union

import requests
from PIL import UnidentifiedImageError, Image
from PIL.Image import DecompressionBombError
from hbutils.system import urlsplit, TemporaryDirectory

from .base import RootDataSource
from ..model import ImageItem
from ..utils import get_requests_session, download_file


class NoURL(Exception):
    """Exception type for a resource that has no URL.

    Nothing in this module raises it.
    NOTE(review): presumably raised by subclasses or other modules; confirm.
    """
    pass


class WebDataSource(RootDataSource):
    """Base data source that downloads images from URLs and yields them as items.

    Subclasses provide the ``(id, url, meta)`` triples through
    :meth:`_iter_data`; :meth:`_iter` downloads each URL into a temporary
    directory, decodes it with Pillow and yields an :class:`ImageItem`.
    """

    def __init__(self, group_name: str, session: Optional[requests.Session] = None,
                 download_silent: bool = True):
        """Store the configuration of the data source.

        :param group_name: Name used as the filename prefix of downloaded
            resources and (capitalized) in warning messages.
        :param session: Session passed to ``download_file``. When falsy, one is
            created with ``get_requests_session()``.
        :param download_silent: Forwarded to ``download_file`` as ``silent``.
        """
        self.download_silent = download_silent
        self.session = session or get_requests_session()
        self.group_name = group_name

    def _iter_data(self) -> Iterator[Tuple[Union[str, int], str, dict]]:
        """Yield ``(id, url, meta)`` triples; must be implemented by subclasses.

        :raises NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError  # pragma: no cover

    def _iter(self) -> Iterator[ImageItem]:
        """Download every resource from :meth:`_iter_data` and yield it as an item.

        Resources whose URL filename ends with ``.gif`` (case-insensitive) are
        skipped with a warning, as are resources that Pillow cannot identify
        and those failing with ``IOError`` or ``DecompressionBombError`` during
        download or decoding.

        :return: Iterator of :class:`ImageItem`, each carrying the loaded image
            and the original metadata extended with a ``'url'`` key.
        """
        for id_, url, meta in self._iter_data():
            _, ext_name = os.path.splitext(urlsplit(url).filename)
            if ext_name.lower() == '.gif':
                warnings.warn(f'{self.group_name.capitalize()} resource {id_} is a GIF image, skipped.')
                continue

            filename = f'{self.group_name}_{id_}{ext_name}'
            with TemporaryDirectory(ignore_cleanup_errors=True) as td:
                td_file = os.path.join(td, filename)
                try:
                    download_file(
                        url, td_file, desc=filename,
                        session=self.session, silent=self.download_silent
                    )
                    image = Image.open(td_file)
                    try:
                        # Force full decoding now, while the temporary file
                        # still exists.
                        image.load()
                    except Exception:
                        # Release the file handle before the temporary
                        # directory is cleaned up; the handlers below still
                        # receive the original exception.
                        image.close()
                        raise
                except UnidentifiedImageError:
                    warnings.warn(f'{self.group_name.capitalize()} resource {id_} unidentified as image, skipped.')
                    continue
                except (IOError, DecompressionBombError) as err:
                    warnings.warn(f'Skipped due to error: {err!r}')
                    continue

                # Build a new dict so the mapping produced by _iter_data is
                # not mutated.
                meta = {**meta, 'url': url}
                yield ImageItem(image, meta)