import glob
import logging
import os.path
import re
import zipfile
from typing import Optional, Union, List

from hbutils.system import TemporaryDirectory
from huggingface_hub import hf_hub_url
from unidecode import unidecode
from waifuc.action import CCIPAction, FilterSimilarAction, RandomFilenameAction
from waifuc.source import EmptySource, LocalSource

from ..crawler import crawl_dataset_to_huggingface
from ...utils import download_file


def crawl_base_to_huggingface(
        source_repository: str, ch_id: Union[int, List[int]],
        name: str, repository: Optional[str] = None,
        limit: Optional[int] = 200, min_images: int = 10,
        no_r18: bool = False, bg_color: str = 'white', drop_multi: bool = True,
        repo_type: str = 'dataset', revision: str = 'main', path_in_repo: str = '.',
        skip_preprocess: bool = True, parallel: bool = True, standalone_ccip: bool = True,
        keep_cnt_ratio: bool = True,
):
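    """
    Merge one or more per-character datasets previously uploaded to
    ``source_repository`` on HuggingFace into a single waifuc source, then
    re-crawl the combined images into the target ``repository``.

    ``ch_id`` may be a single character id or a list of ids; each id is
    expected to correspond to a ``{cid}/dataset.zip`` archive inside
    ``source_repository``.
    """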
    ch_ids = [ch_id] if isinstance(ch_id, int) else ch_id
    source = EmptySource()
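    # Without an explicit target, build a repository name from the slugified,
    # ASCII-transliterated character name plus the last component of the
    # source repository name.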
    if not repository:
        repository = 'CyberHarem/' + re.sub(r'[\W_]+', '_', unidecode(name.lower())).strip('_').lower() + \
                     '_' + source_repository.split('/')[-1]
        logging.info(f'Target repository name {repository!r} will be used.')

    with TemporaryDirectory() as td:
        img_cnts = []
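        # Fetch each character's dataset.zip from the source repository and
        # extract it, recording how many PNG images each character contributes.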
        for cid in ch_ids:
            url = hf_hub_url(source_repository, filename=f'{cid}/dataset.zip', repo_type='dataset')
            os.makedirs(os.path.join(td, str(cid)), exist_ok=True)
            zip_file = os.path.join(td, str(cid), 'dataset.zip')
            download_file(url, zip_file)
            source_dir = os.path.join(td, str(cid), 'source')
            os.makedirs(source_dir, exist_ok=True)
            with zipfile.ZipFile(zip_file, 'r') as zf:
                zf.extractall(source_dir)
            img_cnts.append(len(glob.glob(os.path.join(source_dir, '*.png'))))

        total = sum(img_cnts)
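        # Combine the per-character local sources into one waifuc source:
        # CCIPAction filters out images that do not match the character's
        # visual identity, and keep_cnt_ratio caps each character at its
        # proportional share of the overall limit.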
        for cid, c_cnt in zip(ch_ids, img_cnts):
            source_dir = os.path.join(td, str(cid), 'source')
            new_source = LocalSource(source_dir, shuffle=True)
            if standalone_ccip:
                new_source = new_source.attach(CCIPAction())
            # The proportional cap only applies when an overall limit is set;
            # guard against ``limit=None`` to avoid a TypeError.
            if keep_cnt_ratio and limit is not None:
                new_source = new_source[:int(round(c_cnt * 1.0 / total * limit))]
            if parallel:
                source = source | new_source
            else:
                source = source + new_source
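
        # When downstream preprocessing will be skipped, deduplicate similar
        # images and assign random filenames here instead.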
        if skip_preprocess:
            source = source.attach(
                FilterSimilarAction('all'),
                RandomFilenameAction(ext='.png'),
            )
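
        # Run the shared crawl while the temporary directory still exists,
        # since the merged source reads its images from it lazily.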
        return crawl_dataset_to_huggingface(
            source, repository, name,
            limit, min_images, no_r18, bg_color, drop_multi, skip_preprocess,
            repo_type, revision, path_in_repo
        )