Spaces:
Runtime error
Runtime error
import datetime | |
import glob | |
import json | |
import os.path | |
import zipfile | |
from typing import Union, Tuple, List, Optional | |
import pandas as pd | |
from ditk import logging | |
from gchar.games import get_character | |
from gchar.games.base import Character | |
from hbutils.string import plural_word | |
from hbutils.system import TemporaryDirectory | |
from huggingface_hub import CommitOperationAdd, hf_hub_url | |
from waifuc.action import NoMonochromeAction, FilterSimilarAction, \ | |
TaggingAction, PersonSplitAction, FaceCountAction, CCIPAction, ModeConvertAction, ClassFilterAction, \ | |
FileOrderAction, RatingFilterAction, BaseAction, RandomFilenameAction, PaddingAlignAction, ThreeStageSplitAction, \ | |
AlignMinSizeAction, MinSizeFilterAction, FilterAction | |
from waifuc.action.filter import MinAreaFilterAction | |
from waifuc.export import SaveExporter, TextualInversionExporter | |
from waifuc.model import ImageItem | |
from waifuc.source import GcharAutoSource, BaseDataSource, LocalSource | |
from waifuc.utils import task_ctx | |
from ..utils import number_to_tag, get_ch_name, get_alphabet_name, get_hf_client, download_file, get_hf_fs | |
def get_source(source) -> BaseDataSource:
    """
    Normalize ``source`` into a data source object.

    :param source: A character name (``str``), a ``Character`` object, or an
        already constructed ``BaseDataSource``.
    :return: A ``GcharAutoSource`` built with ``main_sources_count=5`` for
        names / characters, or ``source`` itself when it already is a data source.
    :raises TypeError: If ``source`` is none of the supported types.
    """
    # Names and character objects are wrapped into an automatic gchar source.
    if isinstance(source, (str, Character)):
        return GcharAutoSource(source, main_sources_count=5)

    # Ready-made data sources are passed through untouched.
    if isinstance(source, BaseDataSource):
        return source

    raise TypeError(f'Unknown source type - {source!r}.')
def get_main_source(source, no_r18: bool = False, bg_color: str = 'white',
                    no_monochrome_check: bool = False,
                    drop_multi: bool = True, skip: bool = False) -> BaseDataSource:
    """
    Build the main crawling source with the preprocessing pipeline attached.

    :param source: Anything accepted by :func:`get_source`.
    :param no_r18: When true, a ``RatingFilterAction(['safe', 'r15'])`` is added.
    :param bg_color: Background color handed to ``ModeConvertAction('RGB', ...)``.
    :param no_monochrome_check: When true, the ``NoMonochromeAction`` step is left out.
    :param drop_multi: When true, a ``FaceCountAction(count=1, level='n')`` is applied
        before the person split as well.
    :param skip: When true, no preprocessing action is attached at all.
    :return: The data source with the selected actions attached.
    """
    data_source: BaseDataSource = get_source(source)
    if skip:
        # Preprocessing disabled: attach an empty action list.
        return data_source.attach()

    pipeline = [ModeConvertAction('RGB', bg_color)]
    if not no_monochrome_check:
        # no monochrome, greyscale or sketch
        pipeline.append(NoMonochromeAction())
    # no comic or 3d
    pipeline.append(ClassFilterAction(['illustration', 'bangumi']))
    if no_r18:
        pipeline.append(RatingFilterAction(['safe', 'r15']))
    # filter duplicated images
    pipeline.append(FilterSimilarAction('all'))
    if drop_multi:
        # drop images with 0 or >1 faces
        pipeline.append(FaceCountAction(count=1, level='n'))

    pipeline += [
        PersonSplitAction(level='n'),  # crop for each person
        FaceCountAction(count=1, level='n'),
        FileOrderAction(),  # rename files in order
        # CCIPAction(min_val_count=15),  # CCIP, filter the character you may not want to see in dataset
        FilterSimilarAction('all'),  # filter duplicated images
        MinSizeFilterAction(320),
        TaggingAction(force=True, character_threshold=1.01),
        RandomFilenameAction(ext='.png'),
    ]
    return data_source.attach(*pipeline)
def actions_parse(actions: Union[int, Tuple[int, int], List[BaseAction]], bg_color: str = 'white'):
    """
    Turn a post-processing specification into a list of actions.

    :param actions: One of:

        * a ``list`` - returned unchanged;
        * a ``(width, height)`` tuple - becomes a single ``PaddingAlignAction``
          created with that size and ``bg_color``;
        * an ``int`` - becomes a single ``AlignMinSizeAction`` created with that value.
    :param bg_color: Color passed to ``PaddingAlignAction``; only used for the tuple form.
    :return: A list of actions.
    :raises TypeError: If ``actions`` is of any other type.
    """
    if isinstance(actions, list):
        return actions

    if isinstance(actions, tuple):
        target_width, target_height = actions
        return [PaddingAlignAction((target_width, target_height), bg_color)]

    if isinstance(actions, int):
        return [AlignMinSizeAction(actions)]

    raise TypeError(f'Unknown post action type - {actions!r}.')
class CustomMinSizeAction(FilterAction):
    """
    Size filter with a separate threshold for eye crops.

    An item whose ``meta['crop']['type']`` equals ``'eye'`` is compared against
    ``min_eye_size``; every other item is compared against ``main_size``.
    The compared value is the shorter side of the image.
    """

    def __init__(self, main_size: int = 280, min_eye_size: int = 180):
        """
        :param main_size: Minimum shorter-side length for non-eye items.
        :param min_eye_size: Minimum shorter-side length for eye crops.
        """
        self.main_size = main_size
        self.min_eye_size = min_eye_size

    def check(self, item: ImageItem) -> bool:
        """
        Tell whether ``item`` is large enough.

        :param item: Item to inspect; its ``image`` size and ``meta`` are read.
        :return: ``True`` when the shorter side reaches the applicable threshold.
        """
        shorter_side = min(item.image.width, item.image.height)
        is_eye_crop = 'crop' in item.meta and item.meta['crop']['type'] == 'eye'
        threshold = self.min_eye_size if is_eye_crop else self.main_size
        return shorter_side >= threshold
# Named action pipelines applied to the crawled origin images.
# crawl_dataset_to_huggingface() iterates this mapping in order and exports the
# origin directory through each pipeline into ``source/<key>``; the keys are
# referenced by the first element of every _DEFAULT_RESOLUTIONS entry.
# NOTE: the action instances are constructed once, when this module is imported,
# and the same instances are reused on every call.
_SOURCES = {
    # Images as crawled; only tagging is applied.
    'native': [
        TaggingAction(force=False, character_threshold=1.01),
    ],
    # Three-stage split, then de-duplication, size filtering and tagging.
    'stage3': [
        ThreeStageSplitAction(split_person=False),
        FilterSimilarAction(),
        MinSizeFilterAction(280),
        TaggingAction(force=False, character_threshold=1.01),
    ],
    # Same as 'stage3' but with eye splitting enabled and a separate,
    # smaller minimum size (180) for eye crops via CustomMinSizeAction.
    'stage3-eyes': [
        ThreeStageSplitAction(split_person=False, split_eyes=True),
        FilterSimilarAction(),
        CustomMinSizeAction(280, 180),
        TaggingAction(force=False, character_threshold=1.01),
    ]
}
# Packages produced for every dataset, in the order they appear in the README table.
# Each entry maps the package name to a 3-tuple:
#   (key into _SOURCES, post-processing spec for actions_parse(), README description)
# The post-processing spec is a list of actions (used as-is), a (width, height)
# tuple (PaddingAlignAction) or an int (AlignMinSizeAction) - see actions_parse().
# Packages whose name starts with 'raw' are exported with SaveExporter, all
# others with TextualInversionExporter; each one is uploaded as
# ``dataset-<name>.zip``.
_DEFAULT_RESOLUTIONS = {
    'raw': ('native', [], 'Raw data with meta information.'),
    'raw-stage3': ('stage3', [], '3-stage cropped raw data with meta information.'),
    'raw-stage3-eyes': ('stage3-eyes', [], '3-stage cropped (with eye-focus) raw data with meta information.'),
    '384x512': ('native', (384, 512), '384x512 aligned dataset.'),
    # '512x512': ('native', (512, 512), '512x512 aligned dataset.'),
    '512x704': ('native', (512, 704), '512x704 aligned dataset.'),
    # '640x640': ('native', (640, 640), '640x640 aligned dataset.'),
    '640x880': ('native', (640, 880), '640x880 aligned dataset.'),
    'stage3-640': ('stage3', 640, '3-stage cropped dataset with the shorter side not exceeding 640 pixels.'),
    'stage3-800': ('stage3', 800, '3-stage cropped dataset with the shorter side not exceeding 800 pixels.'),
    'stage3-p512-640': ('stage3', [MinAreaFilterAction(512), AlignMinSizeAction(640)],
                        '3-stage cropped dataset with the area not less than 512x512 pixels.'),
    # 'stage3-1200': ('stage3', 1200, '3-stage cropped dataset with the shorter side not exceeding 1200 pixels.'),
    'stage3-eyes-640': ('stage3-eyes', 640, '3-stage cropped (with eye-focus) dataset '
                                            'with the shorter side not exceeding 640 pixels.'),
    'stage3-eyes-800': ('stage3-eyes', 800, '3-stage cropped (with eye-focus) dataset '
                                            'with the shorter side not exceeding 800 pixels.'),
}
# Version string written to the 'version' field of the uploaded meta.json.
DATASET_PVERSION = 'v1.4'
def _zip_directory(directory: str, zip_file: str) -> None:
    """Pack every file below ``directory`` into ``zip_file`` using '/'-separated relative member names."""
    with zipfile.ZipFile(zip_file, mode='w') as zf:
        for root, _, files in os.walk(directory):
            for file in files:
                file_path = os.path.join(root, file)
                rel_file_path = os.path.relpath(file_path, directory)
                # Zip member names always use '/', whatever the local separator is.
                zf.write(file_path, '/'.join(rel_file_path.split(os.sep)))


def _write_meta_file(meta_file: str, name: str) -> None:
    """Write the dataset's ``meta.json`` (display name and pipeline version) to ``meta_file``."""
    with open(meta_file, 'w', encoding='utf-8') as mf:
        json.dump({
            'name': name,
            'version': DATASET_PVERSION,
        }, mf, indent=4, sort_keys=True, ensure_ascii=False)


def _write_readme_file(readme_file: str, name: str, img_count: int, rows: list) -> None:
    """Write the dataset card (YAML front matter, introduction and package table) to ``readme_file``."""
    columns = ['Name', 'Images', 'Download', 'Description']
    with open(readme_file, 'w', encoding='utf-8') as rf:
        # YAML front matter of the dataset card.
        print('---', file=rf)
        print('license: mit', file=rf)
        print('task_categories:', file=rf)
        print('- text-to-image', file=rf)
        print('tags:', file=rf)
        print('- art', file=rf)
        print('- not-for-all-audiences', file=rf)
        print('size_categories:', file=rf)
        print(f'- {number_to_tag(img_count)}', file=rf)
        print('---', file=rf)
        print('', file=rf)

        print(f'# Dataset of {name}', file=rf)
        print('', file=rf)

        print(f'This is the dataset of {name}, '
              f'containing {plural_word(img_count, "images")} and their tags.', file=rf)
        print('', file=rf)

        print('Images are crawled from many sites (e.g. danbooru, pixiv, zerochan ...), '
              'the auto-crawling system is powered by [DeepGHS Team](https://github.com/deepghs)'
              '([huggingface organization](https://huggingface.co/deepghs)). ', file=rf)
        print('This is a WebUI contains crawlers and other thing: '
              '([LittleAppleWebUI](https://github.com/LittleApple-fp16/LittleAppleWebUI))', file=rf)
        print('', file=rf)

        # One table row per uploaded package.
        df = pd.DataFrame(columns=columns, data=rows)
        print(df.to_markdown(index=False), file=rf)
        print('', file=rf)


def crawl_dataset_to_huggingface(
        source: Union[str, Character, BaseDataSource], repository: Optional[str] = None,
        name: Optional[str] = None, limit: Optional[int] = 10000, min_images: int = 450,
        no_r18: bool = False, bg_color: str = 'white', drop_multi: bool = True, skip_preprocess: bool = False,
        no_monochrome_check: bool = False,
        repo_type: str = 'dataset', revision: str = 'main', path_in_repo: str = '.', private: bool = False,
):
    """
    Crawl images for a character, build the dataset packages and publish them to Hugging Face.

    The function returns early, without uploading anything, when
    ``datasets/{repository}/.gitattributes`` already exists or when fewer than
    ``min_images`` PNG files were collected.

    :param source: Character name, ``Character`` object or a ready data source.
    :param repository: Target repository. When empty it is derived from the
        character (``AppleHarem/<get_ch_name>``) or from ``name``
        (``AppleHarem/<get_alphabet_name>``).
    :param name: Display name of the dataset. Required when ``source`` is a data
        source; for a name / ``Character`` source it is always replaced by
        ``'<enname> (<official name>)'``.
    :param limit: Maximum number of items taken from the source; ``None`` means no limit.
    :param min_images: Minimum number of collected PNG files required to continue.
    :param no_r18: Forwarded to :func:`get_main_source`.
    :param bg_color: Background color used by preprocessing and padding alignment.
    :param drop_multi: Forwarded to :func:`get_main_source`.
    :param skip_preprocess: Forwarded to :func:`get_main_source` as ``skip``.
    :param no_monochrome_check: Forwarded to :func:`get_main_source`.
    :param repo_type: Repository type used for creation and for the commit.
    :param revision: Revision the commit is made on.
    :param path_in_repo: Directory inside the repository that receives the files.
    :param private: Whether a newly created repository is private.
    :return: ``None``.
    :raises ValueError: If ``source`` is a data source and ``name`` is missing, or
        if a character name cannot be resolved.
    """
    if isinstance(source, (str, Character)):
        if isinstance(source, str):
            character = get_character(source)
            if character is None:
                # Fail with a clear message instead of an AttributeError on `.enname` below.
                raise ValueError(f'Character {source!r} not found.')
            source = character
        name = f'{source.enname} ({source.__official_name__})'
        if not repository:
            repository = f'AppleHarem/{get_ch_name(source)}'
    else:
        if name is None:
            raise ValueError('Name must be specified when source is not str or character.')
        if not repository:
            repository = f'AppleHarem/{get_alphabet_name(name)}'

    # Never touch a repository that has already been published.
    hf_fs = get_hf_fs()
    if hf_fs.exists(f'datasets/{repository}/.gitattributes'):
        logging.warn(f'{repository} exists, skipped.')
        return

    origin_source = get_main_source(source, no_r18, bg_color, no_monochrome_check, drop_multi, skip_preprocess)
    with TemporaryDirectory() as td:
        # save origin directory
        origin_dir = os.path.join(td, 'origin')
        os.makedirs(origin_dir, exist_ok=True)
        if limit is not None:
            origin_source = origin_source[:limit]
        with task_ctx('origin'):
            origin_source.export(SaveExporter(origin_dir))

        img_count = len(glob.glob(os.path.join(origin_dir, '*.png')))
        if img_count < min_images:
            logging.warn(f'Only {plural_word(img_count, "image")} found for {name} which is too few, '
                         f'skip post-processing and uploading.')
            return

        # Run every pipeline of _SOURCES over the origin images.
        source_dir = os.path.join(td, 'source')
        os.makedirs(source_dir, exist_ok=True)
        for sname, source_actions in _SOURCES.items():
            with task_ctx(f'source/{sname}'):
                LocalSource(origin_dir).attach(*source_actions).export(SaveExporter(os.path.join(source_dir, sname)))

        processed_dir = os.path.join(td, 'processed')
        os.makedirs(processed_dir, exist_ok=True)
        archive_dir = os.path.join(td, 'archives')
        os.makedirs(archive_dir, exist_ok=True)

        # (local file, file name in repository) pairs
        files_to_upload: List[Tuple[str, str]] = []
        resolutions = _DEFAULT_RESOLUTIONS

        rows = []
        for rname, (sname, post_spec, description) in resolutions.items():
            post_actions = actions_parse(post_spec, bg_color)

            ox = LocalSource(os.path.join(source_dir, sname))
            current_processed_dir = os.path.join(processed_dir, rname)
            with task_ctx(f'archive/{rname}'):
                if not rname.startswith('raw'):  # raw is preserved for exporting json data
                    ox.attach(*post_actions).export(TextualInversionExporter(current_processed_dir))
                else:
                    ox.attach(*post_actions).export(SaveExporter(current_processed_dir))
            current_img_cnt = len(glob.glob(os.path.join(current_processed_dir, '*.png')))

            zip_file = os.path.join(archive_dir, f'dataset-{rname}.zip')
            _zip_directory(current_processed_dir, zip_file)

            rows.append((
                rname,
                current_img_cnt,
                f'[Download]({os.path.basename(zip_file)})',
                description,
            ))
            files_to_upload.append((zip_file, os.path.basename(zip_file)))

        meta_file = os.path.join(td, 'meta.json')
        _write_meta_file(meta_file, name)
        files_to_upload.append((meta_file, 'meta.json'))

        readme_file = os.path.join(td, 'README.md')
        _write_readme_file(readme_file, name, img_count, rows)
        files_to_upload.append((readme_file, 'README.md'))

        hf_client = get_hf_client()
        hf_fs = get_hf_fs()
        logging.info(f'Initialize repository {repository!r}')
        if not hf_fs.exists(f'datasets/{repository}/.gitattributes'):
            hf_client.create_repo(repo_id=repository, repo_type=repo_type, exist_ok=True, private=private)

        current_time = datetime.datetime.now().astimezone().strftime('%Y-%m-%d %H:%M:%S %Z')
        commit_message = f"Publish character {name}, on {current_time}"
        logging.info(f'Publishing character {name!r} to repository {repository!r} ...')
        # The upload has to happen while the temporary directory still exists.
        hf_client.create_commit(
            repository,
            [
                CommitOperationAdd(
                    path_in_repo=f'{path_in_repo}/{filename}',
                    path_or_fileobj=local_file,
                ) for local_file, filename in files_to_upload
            ],
            commit_message=commit_message,
            repo_type=repo_type,
            revision=revision,
            run_as_future=False,
        )
def remake_dataset_to_huggingface(
        repository: Optional[str] = None, limit: Optional[int] = 200, min_images: int = 10,
        no_r18: bool = False, bg_color: str = 'white', drop_multi: bool = True,
        repo_type: str = 'dataset', revision: str = 'main', path_in_repo: str = '.',
):
    """
    Rebuild a published dataset from its own ``dataset-raw.zip`` archive.

    The raw archive is downloaded from ``repository``, unpacked and fed, with
    preprocessing skipped, into :func:`crawl_dataset_to_huggingface`. The
    dataset name is read from the repository's ``meta.json`` when available,
    otherwise the last path segment of ``repository`` is used.

    NOTE(review): crawl_dataset_to_huggingface() returns early when
    ``datasets/{repository}/.gitattributes`` exists, so for a repository that is
    already published the rebuild is presumably skipped - confirm this is intended.

    :param repository: Repository to rebuild, e.g. ``'owner/name'``. Required.
    :param limit: Forwarded to :func:`crawl_dataset_to_huggingface`.
    :param min_images: Forwarded to :func:`crawl_dataset_to_huggingface`.
    :param no_r18: Forwarded to :func:`crawl_dataset_to_huggingface`.
    :param bg_color: Forwarded to :func:`crawl_dataset_to_huggingface`.
    :param drop_multi: Forwarded to :func:`crawl_dataset_to_huggingface`.
    :param repo_type: Forwarded to :func:`crawl_dataset_to_huggingface`.
    :param revision: Forwarded to :func:`crawl_dataset_to_huggingface`.
    :param path_in_repo: Forwarded to :func:`crawl_dataset_to_huggingface`.
    :return: Whatever :func:`crawl_dataset_to_huggingface` returns.
    :raises ValueError: If ``repository`` is empty.
    """
    if not repository:
        raise ValueError('Repository must be specified for remaking dataset.')

    hf_fs = get_hf_fs()
    with TemporaryDirectory() as td:
        zip_file = os.path.join(td, 'dataset-raw.zip')
        download_file(hf_hub_url(repository, 'dataset-raw.zip', repo_type='dataset'), zip_file)

        source_dir = os.path.join(td, 'source')
        os.makedirs(source_dir, exist_ok=True)
        with zipfile.ZipFile(zip_file, 'r') as zf:
            zf.extractall(source_dir)

        source = LocalSource(source_dir)
        name = None
        if hf_fs.exists(f'datasets/{repository}/meta.json'):
            meta_json = json.loads(hf_fs.read_text(f'datasets/{repository}/meta.json'))
            if 'name' in meta_json:
                name = meta_json['name']
        name = name or repository.split('/')[-1]

        # Keyword arguments are required here: the callee has `no_monochrome_check`
        # between `skip_preprocess` and `repo_type`, so passing these positionally
        # would shift repo_type / revision / path_in_repo by one parameter.
        return crawl_dataset_to_huggingface(
            source, repository, name,
            limit=limit, min_images=min_images, no_r18=no_r18, bg_color=bg_color,
            drop_multi=drop_multi, skip_preprocess=True,
            repo_type=repo_type, revision=revision, path_in_repo=path_in_repo,
        )