More joy things
Signed-off-by: Balazs Horvath <acsipont@gmail.com>
- __pycache__/e6db_reader.cpython-312.pyc +0 -0
- data/tag2idx.json.gz +3 -0
- data/tags.txt.gz +3 -0
- data/tags_categories.bin.gz +3 -0
- demo.py +7 -0
- e6db_reader.py +331 -0
__pycache__/e6db_reader.cpython-312.pyc
ADDED
Binary file (16.5 kB)
data/tag2idx.json.gz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1b6d0566323e99297d88d9cb6d8f7403e0f5eebc65670a71f303753a97f9786b
size 3840505
data/tags.txt.gz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fc537f3afe6ae8c152670ba7aa871989286d9139d7be8b1c40cb53ea36cafe0f
size 2630619
data/tags_categories.bin.gz
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c3ba9a95809e680f40a15a41d57a507630eca5f87ca8ee3518ed56aff662413e
size 109543
demo.py
ADDED
@@ -0,0 +1,7 @@
from e6db_reader import TagNormalizer, tag_categories, tag_category2id

tn = TagNormalizer('data')
tn = tn.map_inputs(lambda tag, tid: tag.replace('_', ' '))

for tag in ['pokemon', 'pikachu', 'charizard', 'loona']:
    print(tag, tn.get_category(tag))
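A slightly wider sketch of the same TagNormalizer API that demo.py exercises. The 'pika' alias is hypothetical and the snippet assumes 'pikachu' exists in the shipped tag set; neither value comes from this commit.

from e6db_reader import TagNormalizer

tn = TagNormalizer('data')
# Return a new normalizer that also recognizes tags written with spaces.
tn = tn.map_inputs(lambda tag, tid: tag.replace('_', ' '))

tid = tn.encode('pikachu')   # tag string -> numerical id (None if unknown)
print(tid, tn.decode(tid), tn.get_category('pikachu'))

# Register an extra, hypothetical input alias for the same id; with
# on_conflict="warn" a conflicting existing alias is logged and overwritten.
tn.add_input_mappings('pika', 'pikachu', on_conflict='warn')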
e6db_reader.py
ADDED
@@ -0,0 +1,331 @@
"Python-only utils (no dependencies)"
import gzip
import json
import logging
import math
import warnings
from pathlib import Path
from typing import Callable, Iterable

logger = logging.getLogger(__name__)

tag_categories = [
    "general",
    "artist",
    None,  # Invalid catid
    "copyright",
    "character",
    "species",
    "invalid",
    "meta",
    "lore",
    "pool",
]
tag_category2id = {v: k for k, v in enumerate(tag_categories) if v}
tag_categories_colors = [
    "#b4c7d9",
    "#f2ac08",
    None,  # Invalid catid
    "#d0d",
    "#0a0",
    "#ed5d1f",
    "#ff3d3d",
    "#fff",
    "#282",
    "wheat",
]
tag_categories_alt_colors = [
    "#2e76b4",
    "#fbd67f",
    None,  # Invalid catid
    "#ff5eff",
    "#2bff2b",
    "#f6b295",
    "#ffbdbd",
    "#666",
    "#5fdb5f",
    "#d0b27a",
]


def load_tags(data_dir):
    """
    Load tag data; returns a tuple `(tag2idx, idx2tag, tag_categories)`

    * `tag2idx`: dict mapping tag and aliases to numerical ids
    * `idx2tag`: list mapping numerical id to tag string
    * `tag_categories`: byte string mapping numerical id to categories
    """
    data_dir = Path(data_dir)
    with gzip.open(data_dir / "tags.txt.gz", "rt", encoding="utf-8") as fd:
        idx2tag = fd.read().split("\n")
        if not idx2tag[-1]:
            idx2tag = idx2tag[:-1]
    with gzip.open(data_dir / "tag2idx.json.gz", "rb") as fp:
        tag2idx = json.load(fp)
    with gzip.open(data_dir / "tags_categories.bin.gz", "rb") as fp:
        tag_categories = fp.read()
    logger.info(f"Loaded {len(idx2tag)} tags, {len(tag2idx)} tag2idx mappings")
    return tag2idx, idx2tag, tag_categories


def load_implications(data_dir):
    """
    Load implication mappings. Returns a tuple `(implications, implications_rej)`

    * `implications`: dict mapping numerical ids to a list of implied numerical
      ids. Contains transitive implications.
    * `implications_rej`: dict mapping tag strings to a list of implied
      numerical ids. Keys in implications_rej are tags that have very little
      usage (fewer than 2 posts) and don't have numerical ids associated with
      them.
    """
    with gzip.open(Path(data_dir) / "implications.json.gz", "rb") as fp:
        implications = json.load(fp)
    implications = {int(k): v for k, v in implications.items()}
    with gzip.open(Path(data_dir) / "implications_rej.json.gz", "rb") as fp:
        implications_rej = json.load(fp)
    logger.info(
        f"Loaded {len(implications)} implications + {len(implications_rej)} implications from tags without ids"
    )
    return implications, implications_rej


def tag_rank_to_freq(rank: int) -> float:
    """Approximate the frequency of a tag given its rank"""
    return math.exp(26.4284 * math.tanh(2.93505 * rank ** (-0.136501)) - 11.492)


def tag_freq_to_rank(freq: int) -> float:
    """Approximate the rank of a tag given its frequency"""
    log_freq = math.log(freq)
    return math.exp(
        -7.57186
        * (0.0465456 * log_freq - 1.24326)
        * math.log(1.13045 - 0.0720383 * log_freq)
        + 12.1903
    )


InMapFun = Callable[[str, int | None], list[str]]
OutMapFun = Callable[[str, int], str]


class TagNormalizer:
    """
    Map tag strings to numerical ids, and vice versa.

    Multiple strings can be mapped to a single id, while each id maps to a
    single string. As a result, the encode/decode process can be used to
    normalize tags to canonical spelling.

    See `add_input_mappings` for adding aliases, and `rename_output` for setting
    the canonical spelling of a tag.
    """

    def __init__(self, path_or_data: str | Path | tuple[dict, list, bytes]):
        if isinstance(path_or_data, (Path, str)):
            data = load_tags(path_or_data)
        else:
            data = path_or_data
        self.tag2idx, self.idx2tag, self.tag_categories = data

    def get_category(self, tag: int | str, as_string=True) -> int | str:
        if isinstance(tag, str):
            tag = self.encode(tag)
        cat = self.tag_categories[tag]
        if as_string:
            return tag_categories[cat]
        return cat

    def encode(self, tag: str, default=None):
        "Convert tag string to numerical id"
        return self.tag2idx.get(tag, default)

    def decode(self, tag: int | str):
        "Convert numerical id to tag string"
        if isinstance(tag, str):
            return tag
        return self.idx2tag[tag]

    def get_reverse_mapping(self):
        """Return a list mapping id -> [ tag strings ]"""
        res = [[] for _ in range(len(self.idx2tag))]
        for tag, tid in self.tag2idx.items():
            res[tid].append(tag)
        return res

    def add_input_mappings(
        self, tags: str | Iterable[str], to_tid: int | str, on_conflict="raise"
    ):
        """Associate tag strings to an id for recognition by `encode`

        `on_conflict` defines what to do when the tag string is already mapped
        to a different id:

        * "raise": raise a ValueError (default)
        * "warn": log a warning
        * "overwrite_rarest": make the tag point to the most frequently used tid
        * "overwrite": silently overwrite the mapping
        * "silent", or any other string: don't set the mapping
        """
        tag2idx = self.tag2idx
        if not isinstance(to_tid, int):
            to_tid = tag2idx[to_tid]
        if isinstance(tags, str):
            tags = (tags,)
        for tag in tags:
            conflict = tag2idx.get(tag, to_tid)
            if conflict != to_tid:
                msg = f"mapping {tag!r}->{self.idx2tag[to_tid]!r}({to_tid}) conflicts with previous mapping {tag!r}->{self.idx2tag[conflict]!r}({conflict})."
                if on_conflict == "raise":
                    raise ValueError(msg)
                elif on_conflict == "warn":
                    logger.warning(msg)
                elif on_conflict == "overwrite_rarest" and to_tid > conflict:
                    continue
                elif on_conflict != "overwrite":
                    continue
            tag2idx[tag] = to_tid

    def remove_input_mappings(self, tags: str | Iterable[str]):
        """Remove tag strings from the mapping"""
        if isinstance(tags, str):
            tags = (tags,)
        for tag in tags:
            if tag in self.tag2idx:
                del self.tag2idx[tag]
            else:
                logger.warning(f"tag {tag!r} is not a valid tag")

    def rename_output(self, orig: int | str, dest: str):
        """Change the tag string associated with an id. Used by `decode`."""
        if not isinstance(orig, int):
            orig = self.tag2idx[orig]
        self.idx2tag[orig] = dest

    def map_inputs(
        self, mapfun: InMapFun, prepopulate=True, on_conflict="raise"
    ) -> "TagNormalizer":
        tag2idx = self.tag2idx.copy() if prepopulate else {}
        res = type(self)((tag2idx, self.idx2tag, self.tag_categories))
        for tag, tid in self.tag2idx.items():
            res.add_input_mappings(mapfun(tag, tid), tid, on_conflict=on_conflict)
        return res

    def map_outputs(self, mapfun: OutMapFun) -> "TagNormalizer":
        idx2tag = [mapfun(t, i) for i, t in enumerate(self.idx2tag)]
        return type(self)((self.tag2idx, idx2tag, self.tag_categories))

    def get(self, key: int | str, default=None):
        """
        Returns the string tag associated with a numerical id, or conversely,
        the id associated with a tag.
        """
        if isinstance(key, int):
            idx2tag = self.idx2tag
            if key >= len(idx2tag):
                return default
            return idx2tag[key]
        return self.tag2idx.get(key, default)


class TagSetNormalizer:
    def __init__(self, path_or_data: str | Path | tuple[TagNormalizer, dict, dict]):
        if isinstance(path_or_data, (Path, str)):
            data = TagNormalizer(path_or_data), *load_implications(path_or_data)
        else:
            data = path_or_data
        self.tag_normalizer, self.implications, self.implications_rej = data

    def map_inputs(self, mapfun: InMapFun, on_conflict="raise") -> "TagSetNormalizer":
        tag_normalizer = self.tag_normalizer.map_inputs(mapfun, on_conflict=on_conflict)

        implications_rej: dict[str, list[int]] = {}
        for tag_string, implied_ids in self.implications_rej.items():
            for new_tag_string in mapfun(tag_string, None):
                conflict = implications_rej.get(new_tag_string, implied_ids)
                if conflict != implied_ids:
                    msg = f"mapping {tag_string!r}->{implied_ids} conflicts with previous mapping {tag_string!r}->{conflict}."
                    if on_conflict == "raise":
                        raise ValueError(msg)
                    elif on_conflict == "warn":
                        warnings.warn(msg)
                    elif on_conflict != "overwrite":
                        continue
                implications_rej[new_tag_string] = implied_ids

        res = type(self)((tag_normalizer, self.implications, implications_rej))
        return res

    def map_outputs(self, mapfun: OutMapFun) -> "TagSetNormalizer":
        tag_normalizer = self.tag_normalizer.map_outputs(mapfun)
        return type(self)((tag_normalizer, self.implications, self.implications_rej))

    def get_implied(self, tag: int | str) -> list[int]:
        if isinstance(tag, int):
            return self.implications.get(tag, ())
        else:
            return self.implications_rej.get(tag, ())

    def encode(
        self,
        tags: list[str],
        keep_implied: bool | set[int] = False,
        max_antecedent_rank: int | None = None,
        drop_antecedent_rank: int | None = None,
    ) -> tuple[list[int | str], set[int]]:
        """
        Encode a list of strings as numerical ids and strip implied tags.

        Unknown tags are returned as strings.

        Returns:

        * a list of tag ids and unknown tag strings,
        * a set of implied tag ids.
        """
        tag2idx = self.tag_normalizer.tag2idx
        N = len(tag2idx)
        max_antecedent_rank = max_antecedent_rank or N + 1
        drop_antecedent_rank = drop_antecedent_rank or N + 1
        get_implied = self.implications.get
        get_implied_rej = self.implications_rej.get

        stack = [tag2idx.get(tag, tag) for tag in tags[::-1]]
        implied = set()
        res = dict()  # dict as a cheap ordered set
        while stack:
            tag = stack.pop()
            if isinstance(tag, int):
                antecedent_rank = tag
                consequents = get_implied(tag)
            else:
                # the tag might be a very rare antecedent (fewer than two posts)
                # that doesn't have a tag id
                antecedent_rank = N
                consequents = get_implied_rej(tag)
            if consequents:
                if antecedent_rank < max_antecedent_rank:
                    implied.update(consequents)
                else:
                    # Implied tags from a low-frequency antecedent (high rank)
                    # are kept in the output instead of being stripped, and
                    # the antecedent itself may be dropped
                    stack.extend(consequents)
                    if antecedent_rank >= drop_antecedent_rank:
                        continue
            res[tag] = None
        res = res.keys()

        if not keep_implied:
            res = [t for t in res if t not in implied]
        elif isinstance(keep_implied, set):
            res = [t for t in res if t not in implied or t in keep_implied]
        else:
            res = list(res)
        return res, implied

    def decode(self, tags: Iterable[int | str]) -> list[str]:
        idx2tag = self.tag_normalizer.idx2tag
        return [idx2tag[t] if isinstance(t, int) else t for t in tags]
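e6db_reader.py also defines TagSetNormalizer, which additionally expects implications.json.gz and implications_rej.json.gz under data/; those two files are not part of this commit. A minimal sketch of that API, assuming the implication files are present and using made-up example tags:

from pathlib import Path
from e6db_reader import TagSetNormalizer

tsn = TagSetNormalizer(Path('data'))
# The input-mapping function returns a list here, because
# TagSetNormalizer.map_inputs iterates over its result.
tsn = tsn.map_inputs(lambda tag, tid: [tag.replace('_', ' ')], on_conflict='warn')

ids, implied = tsn.encode(['pikachu', 'pokemon', 'not_a_real_tag'])
print(tsn.decode(ids))   # implied consequents stripped, unknown strings kept as-is
print(len(implied), 'implied tag ids were stripped')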