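"""Sync the BangumiBase index README on Hugging Face.

Scans every dataset under the ``BangumiBase`` organization, resolves each
anime's page and poster on MyAnimeList, and regenerates the summary table
committed to the ``BangumiBase/README`` space.
"""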
import datetime
import fnmatch
import json
import logging
import os.path
import textwrap
from typing import Tuple, Optional

import dateparser
import pandas as pd
from hbutils.string import plural_word
from hbutils.system import TemporaryDirectory
from huggingface_hub import CommitOperationAdd
from pyquery import PyQuery as pq
from tqdm.auto import tqdm

from ...utils import get_hf_client, get_hf_fs, get_requests_session, srequest, download_file

hf_client = get_hf_client()
hf_fs = get_hf_fs()


def get_animelist_info(bangumi_name) -> Tuple[Optional[str], Optional[str]]:
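    """Search MyAnimeList for ``bangumi_name`` and return its page and poster.

    Returns a ``(page_url, poster_url)`` tuple taken from the first search
    result that has both, or ``(None, None)`` when no usable result is found.
    """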
    session = get_requests_session()
    resp = srequest(
        session, 'GET', 'https://myanimelist.net/anime.php',
        params={
            'cat': 'anime',
            'q': bangumi_name,
        }
    )
    table = pq(resp.text)('.js-block-list.list table')
    for row in table('tr').items():
        bangumi_url = row('td:nth-child(1) a').attr('href')
        if not bangumi_url:
            continue

        r = srequest(session, 'GET', bangumi_url)
        p = pq(r.text)
        post_url = p("img[itemprop=image]").attr('data-src')
        if post_url:  # bangumi_url is already known to be truthy at this point
            return bangumi_url, post_url

    return None, None


def sync_bangumi_base(repository: str = 'BangumiBase/README'):
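    """Rebuild the BangumiBase index and push it to the README space.

    Gathers metadata from every ``BangumiBase/*`` dataset, downloads the
    matching MyAnimeList posters, renders a markdown summary table, and
    commits everything to ``repository`` (a Hugging Face space).
    """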
    cb_models = [item.modelId for item in hf_client.list_models(author='CyberHarem')]
    cb_datasets = [item.id for item in hf_client.list_datasets(author='CyberHarem')]

    with TemporaryDirectory() as td:
        readme_file = os.path.join(td, 'README.md')
        with open(readme_file, 'w') as f:
            rows, total_images, total_clusters, total_animes = [], 0, 0, 0
            for item in tqdm(list(hf_client.list_datasets(author='BangumiBase'))):
                if not hf_fs.exists(f'datasets/{item.id}/meta.json'):
                    logging.info(f'No meta information found for {item.id!r}, skipped')
                    continue

                # meta.json describes the extracted dataset: name, image count, cluster ids
                meta = json.loads(hf_fs.read_text(f'datasets/{item.id}/meta.json'))
                bangumi_name = meta['name']
                # escape characters that would break the markdown link and table cell
                safe_bangumi_name = bangumi_name.replace('`', ' ').replace('[', '(').replace(']', ')')
                suffix = item.id.split('/')[-1]
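                # count the CyberHarem datasets/models derived from this bangumi,
                # matched via the shared repository-name suffix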
                datasets_cnt = len([x for x in cb_datasets if fnmatch.fnmatch(x, f'CyberHarem/*_{suffix}')])
                models_cnt = len([x for x in cb_models if fnmatch.fnmatch(x, f'CyberHarem/*_{suffix}')])

                # look up the MyAnimeList page and poster for this bangumi
                page_url, post_url = get_animelist_info(bangumi_name)
                if post_url:
                    post_file = os.path.join(td, 'posts', f'{suffix}.jpg')
                    os.makedirs(os.path.dirname(post_file), exist_ok=True)
                    download_file(post_url, post_file)
                else:
                    post_file = None

                dataset_url = f'https://huggingface.co/datasets/{item.id}'
                post_md = f'![{suffix}]({os.path.relpath(post_file, td)})' if post_file else '(no post)'
                if page_url:
                    post_md = f'[{post_md}]({page_url})'
                last_modified = dateparser.parse(item.lastModified) \
                    if isinstance(item.lastModified, str) else item.lastModified
                # cluster ids come from the clustering step; -1 marks noise samples
                clusters_cnt = len([x for x in meta['ids'] if x != -1])
                rows.append({
                    'Post': post_md,
                    'Bangumi': f'[{safe_bangumi_name}]({dataset_url})',
                    'Last Modified': last_modified.strftime('%Y-%m-%d %H:%M'),
                    'Images': meta['total'],
                    'Clusters': clusters_cnt,
                    'Datasets': f'[{datasets_cnt}](https://huggingface.co/CyberHarem?'
                                f'search_models=_{suffix}&search_datasets=_{suffix})',
                    'Models': f'[{models_cnt}](https://huggingface.co/CyberHarem?'
                              f'search_models=_{suffix}&search_datasets=_{suffix})',
                })
                total_images += meta['total']
                total_clusters += clusters_cnt
                total_animes += 1

            # write the space front-matter, the intro text, and the running totals
            print(textwrap.dedent(f"""
                ---
                title: README
                emoji: 🌖
                colorFrom: green
                colorTo: red
                sdk: static
                pinned: false
                ---

                ## What is this?

                This is a data hub utilized by the [DeepGHS team](https://huggingface.co/deepghs) for processing 
                anime series (in video format, including TV, OVA, movies, etc.).

                After downloading anime videos to our GPU cluster, we employ various computer vision algorithms to 
                extract frames, crop, and **cluster them based on character features**. These processed frames are 
                then uploaded here to reduce the manual sorting effort required for character images.

                The data in this repository will undergo automated secondary processing to remove noise, 
                after which it will be packaged and uploaded to [CyberHarem](https://huggingface.co/CyberHarem). 
                It will then be integrated into an automated pipeline for training character LoRA.

                ## Current Anime Database (constantly updated)

                Last updated on: {datetime.datetime.now().strftime("%Y-%m-%d %H:%M")}.
                This database contains {plural_word(total_animes, "anime")}, {plural_word(total_images, "image")} 
                and {plural_word(total_clusters, "cluster")} in total.
            """).strip(), file=f)

            # newest first; 'Last Modified' was formatted above, so re-parse it for sorting
            rows = sorted(rows, key=lambda x: dateparser.parse(x['Last Modified']), reverse=True)
            df = pd.DataFrame(rows)
            print(df.to_markdown(index=False), file=f)

        operations = []
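        # stage every generated file (README.md plus downloaded posters),
        # keeping paths relative to the temporary directory as repo paths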
        for directory, _, files in os.walk(td):
            for file in files:
                filename = os.path.abspath(os.path.join(directory, file))
                relpath = os.path.relpath(filename, td)
                operations.append(CommitOperationAdd(
                    path_in_repo=relpath,
                    path_or_fileobj=filename,
                ))

        current_time = datetime.datetime.now().astimezone().strftime('%Y-%m-%d %H:%M:%S %Z')
        commit_message = f'Update lfs images, on {current_time}'
        logging.info(f'Updating lfs images to repository {repository!r} ...')
        hf_client.create_commit(
            repository,
            operations,
            commit_message=commit_message,
            repo_type='space',
            revision='main',
        )
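

# Minimal entry-point sketch; the surrounding project presumably exposes its
# own CLI, so running this module directly is an assumption for illustration.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    sync_bangumi_base()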