Upload model

Browse files

Files changed (5) hide show

config.json +15 -0
configuration_cased.py +26 -0
modeling_cased.py +252 -0
pytorch_model.bin +3 -0
transforms_cased.py +438 -0

config.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "alpha": 0.5,
+  "architectures": [
+    "CaSEDModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_cased.CaSEDConfig",
+    "AutoModel": "modeling_cased.CaSEDModel"
+  },
+  "index_name": "cc12m",
+  "model_type": "cased",
+  "retrieval_num_results": 10,
+  "torch_dtype": "float32",
+  "transformers_version": "4.29.2"
+}

configuration_cased.py ADDED Viewed

	@@ -0,0 +1,26 @@

+from transformers.modeling_utils import PretrainedConfig
+class CaSEDConfig(PretrainedConfig):
+    """Configuration class for CaSED.
+    Args:
+        index_name (str, optional): Name of the index. Defaults to "cc12m".
+        alpha (float, optional): Weight of the vision loss. Defaults to 0.5.
+        retrieval_num_results (int, optional): Number of results to return. Defaults to 10.
+    """
+    model_type = "cased"
+    is_composition = True
+    def __init__(
+        self,
+        index_name: str = "cc12m",
+        alpha: float = 0.5,
+        retrieval_num_results: int = 10,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.index_name = index_name
+        self.alpha = alpha
+        self.retrieval_num_results = retrieval_num_results

modeling_cased.py ADDED Viewed

	@@ -0,0 +1,252 @@

+import os
+import tarfile
+from pathlib import Path
+from typing import Optional
+import faiss
+import numpy as np
+import pyarrow as pa
+import requests
+import torch
+from tqdm import tqdm
+from transformers import CLIPModel, CLIPProcessor
+from transformers.modeling_utils import PreTrainedModel
+from .configuration_cased import CaSEDConfig
+from .transforms_cased import default_vocabulary_transforms
+# Registry of the retrieval databases known to the model, keyed by index name.
+# - "url": archive downloaded by `CaSEDModel.prepare_data` when the database is not cached.
+# - "cache_subdir": folder holding the index files, relative to `<cache_dir>/databases/`.
+DATABASES = {
+    "cc12m": {
+        "url": "https://storage-cased.alessandroconti.me/cc12m.tar.gz",
+        "cache_subdir": "./cc12m/vit-l-14/",
+    },
+}
+class MetadataProvider:
+    """Metadata provider.
+    It uses arrow files to store metadata and retrieve it efficiently.
+    Every file found under the given folder is memory-mapped and the resulting tables are
+    concatenated into a single table, which is then addressed by row position.
+    Code reference:
+        - https://github.dev/rom1504/clip-retrieval
+    Args:
+        arrow_folder (Path): Folder searched recursively for the arrow files.
+    """
+    def __init__(self, arrow_folder: Path):
+        # files are read in sorted path order, which fixes the row order of the merged table
+        arrow_files = [str(a) for a in sorted(arrow_folder.glob("**/*")) if a.is_file()]
+        self.table = pa.concat_tables(
+            [
+                pa.ipc.RecordBatchFileReader(pa.memory_map(arrow_file, "r")).read_all()
+                for arrow_file in arrow_files
+            ]
+        )
+    def get(self, ids: np.ndarray, cols: Optional[list] = None) -> list:
+        """Get arrow metadata from ids.
+        Args:
+            ids (np.ndarray): Ids to retrieve. They are used as row positions in the table.
+            cols (Optional[list], optional): Columns to retrieve. Names that are not in the
+                table schema are ignored. Defaults to None, which selects every column.
+        Returns:
+            list: One dictionary per retrieved row, mapping column names to values.
+        """
+        if cols is None:
+            cols = self.table.schema.names
+        else:
+            # the set intersection drops unknown names; it does not keep the requested order
+            cols = list(set(self.table.schema.names) & set(cols))
+        # `ids + 1` relies on numpy arithmetic: each id selects the one-row slice [id, id + 1)
+        t = pa.concat_tables([self.table[i:j] for i, j in zip(ids, ids + 1)])
+        return t.select(cols).to_pandas().to_dict("records")
+class CaSEDModel(PreTrainedModel):
+    """Transformers module for Category Search from External Databases (CaSED).
+    Reference:
+        - Conti et al. Vocabulary-free Image Classification. arXiv 2023.
+    Args:
+        config (CaSEDConfig): Configuration class for CaSED.
+    """
+    config_class = CaSEDConfig
+    def __init__(self, config: CaSEDConfig):
+        super().__init__(config)
+        # load CLIP
+        model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
+        self.vision_encoder = model.vision_model
+        self.vision_proj = model.visual_projection
+        self.language_encoder = model.text_model
+        self.language_proj = model.text_projection
+        self.logit_scale = model.logit_scale.exp()
+        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+        # load transforms
+        self.vocabulary_transforms = default_vocabulary_transforms()
+        # set hparams
+        self.hparams = {}
+        self.hparams["alpha"] = config.alpha
+        self.hparams["index_name"] = config.index_name
+        self.hparams["retrieval_num_results"] = config.retrieval_num_results
+        # set cache dir
+        self.hparams["cache_dir"] = Path(os.path.expanduser("~/.cache/cased"))
+        os.makedirs(self.hparams["cache_dir"], exist_ok=True)
+        # download databases
+        self.prepare_data()
+        # load faiss indices and metadata providers
+        self.resources = {}
+        for name, items in DATABASES.items():
+            database_path = self.hparams["cache_dir"] / "databases" / items["cache_subdir"]
+            text_index_fp = database_path / "text.index"
+            metadata_fp = database_path / "metadata/"
+            text_index = faiss.read_index(
+                str(text_index_fp), faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY
+            )
+            metadata_provider = MetadataProvider(metadata_fp)
+            self.resources[name] = {
+                "device": self.device,
+                "model": "ViT-L-14",
+                "text_index": text_index,
+                "metadata_provider": metadata_provider,
+            }
+    def prepare_data(self):
+        """Download data if needed."""
+        databases_path = Path(self.hparams["cache_dir"]) / "databases"
+        for name, items in DATABASES.items():
+            url = items["url"]
+            database_path = Path(databases_path, name)
+            if database_path.exists():
+                continue
+            # download data
+            target_path = Path(databases_path, name + ".tar.gz")
+            os.makedirs(target_path.parent, exist_ok=True)
+            with requests.get(url, stream=True) as r:
+                r.raise_for_status()
+                total_bytes_size = int(r.headers.get('content-length', 0))
+                chunk_size = 8192
+                p_bar = tqdm(
+                    desc="Downloading cc12m index",
+                    total=total_bytes_size,
+                    unit='iB',
+                    unit_scale=True,
+                )
+                with open(target_path, 'wb') as f:
+                    for chunk in r.iter_content(chunk_size=chunk_size):
+                        f.write(chunk)
+                        p_bar.update(len(chunk))
+                p_bar.close()
+            # extract data
+            tar = tarfile.open(target_path, "r:gz")
+            tar.extractall(target_path.parent)
+            tar.close()
+            target_path.unlink()
+    @torch.no_grad()
+    def query_index(self, sample_z: torch.Tensor) -> torch.Tensor:
+        """Query the external database index.
+        Args:
+            sample_z (torch.Tensor): Sample to query the index.
+        """
+        # get the index
+        resources = self.resources[self.hparams["index_name"]]
+        text_index = resources["text_index"]
+        metadata_provider = resources["metadata_provider"]
+        # query the index
+        sample_z = sample_z.squeeze(0)
+        sample_z = sample_z / sample_z.norm(dim=-1, keepdim=True)
+        query_input = sample_z.cpu().detach().numpy().tolist()
+        query = np.expand_dims(np.array(query_input).astype("float32"), 0)
+        distances, idxs, _ = text_index.search_and_reconstruct(
+            query, self.hparams["retrieval_num_results"]
+        )
+        results = idxs[0]
+        nb_results = np.where(results == -1)[0]
+        nb_results = nb_results[0] if len(nb_results) > 0 else len(results)
+        indices = results[:nb_results]
+        distances = distances[0][:nb_results]
+        if len(distances) == 0:
+            return []
+        # get the metadata
+        results = []
+        metadata = metadata_provider.get(indices[:20], ["caption"])
+        for key, (d, i) in enumerate(zip(distances, indices)):
+            output = {}
+            meta = None if key + 1 > len(metadata) else metadata[key]
+            if meta is not None:
+                output.update(meta)
+            output["id"] = i.item()
+            output["similarity"] = d.item()
+            results.append(output)
+        # get the captions only
+        vocabularies = [result["caption"] for result in results]
+        return vocabularies
+    @torch.no_grad()
+    def forward(self, images: dict, alpha: Optional[float] = None) -> torch.Tensor():
+        """Forward pass.
+        Args:
+            images (dict): Dictionary with the images. The expected keys are:
+                - pixel_values (torch.Tensor): Pixel values of the images.
+            alpha (Optional[float]): Alpha value for the interpolation.
+        """
+        # forward the images
+        images["pixel_values"] = images["pixel_values"].to(self.device)
+        images_z = self.vision_proj(self.vision_encoder(**images)[1])
+        vocabularies, samples_p = [], []
+        for image_z in images_z:
+            # generate a single text embedding from the unfiltered vocabulary
+            vocabulary = self.query_index(image_z)
+            text = self.processor(text=vocabulary, return_tensors="pt", padding=True)
+            text["input_ids"] = text["input_ids"][:, :77].to(self.device)
+            text["attention_mask"] = text["attention_mask"][:, :77].to(self.device)
+            text_z = self.language_encoder(**text)[1]
+            text_z = self.language_proj(text_z)
+            # filter the vocabulary, embed it, and get its mean embedding
+            vocabulary = self.vocabulary_transforms(vocabulary) or ["object"]
+            text = self.processor(text=vocabulary, return_tensors="pt", padding=True)
+            text = {k: v.to(self.device) for k, v in text.items()}
+            vocabulary_z = self.language_encoder(**text)[1]
+            vocabulary_z = self.language_proj(vocabulary_z)
+            vocabulary_z = vocabulary_z / vocabulary_z.norm(dim=-1, keepdim=True)
+            # get the image and text predictions
+            image_z = image_z / image_z.norm(dim=-1, keepdim=True)
+            text_z = text_z / text_z.norm(dim=-1, keepdim=True)
+            image_p = (torch.matmul(image_z, vocabulary_z.T) * self.logit_scale).softmax(dim=-1)
+            text_p = (torch.matmul(text_z, vocabulary_z.T) * self.logit_scale).softmax(dim=-1)
+            # average the image and text predictions
+            alpha = alpha or self.hparams["alpha"]
+            sample_p = alpha * image_p + (1 - alpha) * text_p
+            # save the results
+            samples_p.append(sample_p)
+            vocabularies.append(vocabulary)
+        # get the scores
+        samples_p = torch.stack(samples_p, dim=0)
+        scores = sample_p.cpu().tolist()
+        # define the results
+        results = {"vocabularies": vocabularies, "scores": scores}
+        return results

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91c5a2012ab49580ef33645ef578ab2eab491ace7ed63e856f9ef340f73e0e9e
+size 1710665929

transforms_cased.py ADDED Viewed

	@@ -0,0 +1,438 @@

+import re
+from abc import ABC, abstractmethod
+from collections import Counter
+from typing import Any, Union
+import inflect
+import nltk
+from flair.data import Sentence
+from flair.models import SequenceTagger
+__all__ = [
+    "DropFileExtensions",
+    "DropNonAlpha",
+    "DropShortWords",
+    "DropSpecialCharacters",
+    "DropTokens",
+    "DropURLs",
+    "DropWords",
+    "FilterPOS",
+    "FrequencyMinWordCount",
+    "FrequencyTopK",
+    "ReplaceSeparators",
+    "ToLowercase",
+    "ToSingular",
+]
+class BaseTextTransform(ABC):
+    """Base class for string transforms."""
+    @abstractmethod
+    def __call__(self, text: str):
+        raise NotImplementedError
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}()"
+class DropFileExtensions(BaseTextTransform):
+    """Remove file extensions from the input text."""
+    def __call__(self, text: str):
+        """
+        Args:
+            text (str): Text to remove file extensions from.
+        """
+        text = re.sub(r"\.\w+", "", text)
+        return text
+class DropNonAlpha(BaseTextTransform):
+    """Remove non-alpha words from the input text."""
+    def __call__(self, text: str):
+        """
+        Args:
+            text (str): Text to remove non-alpha words from.
+        """
+        text = re.sub(r"[^a-zA-Z\s]", "", text)
+        return text
+class DropShortWords(BaseTextTransform):
+    """Remove short words from the input text.
+    Args:
+        min_length (int): Minimum length of words to keep.
+    """
+    def __init__(self, min_length) -> None:
+        super().__init__()
+        self.min_length = min_length
+    def __call__(self, text: str):
+        """
+        Args:
+            text (str): Text to remove short words from.
+        """
+        text = " ".join([word for word in text.split() if len(word) >= self.min_length])
+        return text
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(min_length={self.min_length})"
+class DropSpecialCharacters(BaseTextTransform):
+    """Remove special characters from the input text.
+    Special characters are defined as any character that is not a word character, whitespace,
+    hyphen, period, apostrophe, or ampersand.
+    """
+    def __call__(self, text: str):
+        """
+        Args:
+            text (str): Text to remove special characters from.
+        """
+        text = re.sub(r"[^\w\s\-\.\'\&]", "", text)
+        return text
+class DropTokens(BaseTextTransform):
+    """Remove tokens from the input text.
+    Tokens are defined as strings enclosed in angle brackets, e.g. <token>.
+    """
+    def __call__(self, text: str):
+        """
+        Args:
+            text (str): Text to remove tokens from.
+        """
+        text = re.sub(r"<[^>]+>", "", text)
+        return text
+class DropURLs(BaseTextTransform):
+    """Remove URLs from the input text."""
+    def __call__(self, text: str):
+        """
+        Args:
+            text (str): Text to remove URLs from.
+        """
+        text = re.sub(r"http\S+", "", text)
+        return text
+class DropWords(BaseTextTransform):
+    """Remove words from the input text.
+    It is case-insensitive and supports singular and plural forms of the words.
+    """
+    def __init__(self, words: list[str]) -> None:
+        super().__init__()
+        self.words = words
+        self.pattern = r"\b(?:{})\b".format("|".join(words))
+    def __call__(self, text: str):
+        """
+        Args:
+            text (str): Text to remove words from.
+        """
+        text = re.sub(self.pattern, "", text, flags=re.IGNORECASE)
+        return text
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(pattern={self.pattern})"
+class FilterPOS(BaseTextTransform):
+    """Filter words by POS tags.
+    Args:
+        tags (list): List of POS tags to remove.
+        engine (str): POS tagger to use. Must be one of "nltk" or "flair". Defaults to "nltk".
+        keep_compound_nouns (bool): Whether to keep composed words. Defaults to True.
+    """
+    def __init__(self, tags: list, engine: str = "nltk", keep_compound_nouns: bool = True) -> None:
+        super().__init__()
+        self.tags = tags
+        self.engine = engine
+        self.keep_compound_nouns = keep_compound_nouns
+        if engine == "nltk":
+            nltk.download("averaged_perceptron_tagger", quiet=True)
+            nltk.download("punkt", quiet=True)
+            self.tagger = lambda x: nltk.pos_tag(nltk.word_tokenize(x))
+        elif engine == "flair":
+            self.tagger = SequenceTagger.load("flair/pos-english-fast").predict
+    def __call__(self, text: str):
+        """
+        Args:
+            text (str): Text to remove words with specific POS tags from.
+        """
+        if self.engine == "nltk":
+            word_tags = self.tagger(text)
+            text = " ".join([word for word, tag in word_tags if tag not in self.tags])
+        elif self.engine == "flair":
+            sentence = Sentence(text)
+            self.tagger(sentence)
+            text = " ".join([token.text for token in sentence.tokens if token.tag in self.tags])
+        if self.keep_compound_nouns:
+            compound_nouns = []
+            if self.engine == "nltk":
+                for i in range(len(word_tags) - 1):
+                    if word_tags[i][1] == "NN" and word_tags[i + 1][1] == "NN":
+                        # if they are the same word, skip
+                        if word_tags[i][0] == word_tags[i + 1][0]:
+                            continue
+                        compound_noun = word_tags[i][0] + "_" + word_tags[i + 1][0]
+                        compound_nouns.append(compound_noun)
+            elif self.engine == "flair":
+                for i in range(len(sentence.tokens) - 1):
+                    if sentence.tokens[i].tag == "NN" and sentence.tokens[i + 1].tag == "NN":
+                        # if they are the same word, skip
+                        if sentence.tokens[i].text == sentence.tokens[i + 1].text:
+                            continue
+                        compound_noun = sentence.tokens[i].text + "_" + sentence.tokens[i + 1].text
+                        compound_nouns.append(compound_noun)
+            text = " ".join([text, " ".join(compound_nouns)])
+        return text
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(tags={self.tags}, engine={self.engine})"
+class FrequencyMinWordCount(BaseTextTransform):
+    """Keep only words that occur more than a minimum number of times in the input text.
+    If the threshold is too strong and no words pass the threshold, the threshold is reduced to
+    the most frequent word.
+    Args:
+        min_count (int): Minimum number of occurrences of a word to keep.
+    """
+    def __init__(self, min_count) -> None:
+        super().__init__()
+        self.min_count = min_count
+    def __call__(self, text: str):
+        """
+        Args:
+            text (str): Text to remove infrequent words from.
+        """
+        if self.min_count <= 1:
+            return text
+        words = text.split()
+        word_counts = {word: words.count(word) for word in words}
+        # if nothing passes the threshold, reduce the threshold to the most frequent word
+        max_word_count = max(word_counts.values() or [0])
+        min_count = max_word_count if self.min_count > max_word_count else self.min_count
+        text = " ".join([word for word in words if word_counts[word] >= min_count])
+        return text
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(min_count={self.min_count})"
+class FrequencyTopK(BaseTextTransform):
+    """Keep only the top k most frequent words in the input text.
+    In case of a tie, all words with the same count as the last word are kept.
+    Args:
+        top_k (int): Number of top words to keep.
+    """
+    def __init__(self, top_k: int) -> None:
+        super().__init__()
+        self.top_k = top_k
+    def __call__(self, text: str):
+        """
+        Args:
+            text (str): Text to remove infrequent words from.
+        """
+        if self.top_k < 1:
+            return text
+        words = text.split()
+        word_counts = {word: words.count(word) for word in words}
+        top_words = sorted(word_counts, key=word_counts.get, reverse=True)
+        # in case of a tie, keep all words with the same count
+        top_words = top_words[: self.top_k]
+        top_words = [word for word in top_words if word_counts[word] == word_counts[top_words[-1]]]
+        text = " ".join([word for word in words if word in top_words])
+        return text
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}(top_k={self.top_k})"
+class ReplaceSeparators(BaseTextTransform):
+    """Replace underscores and dashes with spaces."""
+    def __call__(self, text: str):
+        """
+        Args:
+            text (str): Text to replace separators in.
+        """
+        text = re.sub(r"[_\-]", " ", text)
+        return text
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}()"
+class RemoveDuplicates(BaseTextTransform):
+    """Remove duplicate words from the input text."""
+    def __call__(self, text: str):
+        """
+        Args:
+            text (str): Text to remove duplicate words from.
+        """
+        text = " ".join(list(set(text.split())))
+        return text
+class TextCompose:
+    """Compose several transforms together.
+    It differs from the torchvision.transforms.Compose class in that it applies the transforms to
+    a string instead of a PIL Image or Tensor. In addition, it automatically join the list of
+    input strings into a single string and splits the output string into a list of words.
+    Args:
+        transforms (list): List of transforms to compose.
+    """
+    def __init__(self, transforms: list[BaseTextTransform]) -> None:
+        self.transforms = transforms
+    def __call__(self, text: Union[str, list[str]]) -> Any:
+        if isinstance(text, list):
+            text = " ".join(text)
+        for t in self.transforms:
+            text = t(text)
+        return text.split()
+    def __repr__(self) -> str:
+        format_string = self.__class__.__name__ + "("
+        for t in self.transforms:
+            format_string += "\n"
+            format_string += f"    {t}"
+        format_string += "\n)"
+        return format_string
+class ToLowercase(BaseTextTransform):
+    """Convert text to lowercase."""
+    def __call__(self, text: str):
+        """
+        Args:
+            text (str): Text to convert to lowercase.
+        """
+        text = text.lower()
+        return text
+class ToSingular(BaseTextTransform):
+    """Convert plural words to singular form."""
+    def __init__(self) -> None:
+        super().__init__()
+        self.transform = inflect.engine().singular_noun
+    def __call__(self, text: str):
+        """
+        Args:
+            text (str): Text to convert to singular form.
+        """
+        words = text.split()
+        for i, word in enumerate(words):
+            if not word.endswith("s"):
+                continue
+            if word[-2:] in ["ss", "us", "is"]:
+                continue
+            if word[-3:] in ["ies", "oes"]:
+                continue
+            words[i] = self.transform(word) or word
+        text = " ".join(words)
+        return text
+    def __repr__(self) -> str:
+        return f"{self.__class__.__name__}()"
+def default_vocabulary_transforms() -> TextCompose:
+    """Preprocess input text with preprocessing transforms."""
+    words_to_drop = [
+        "image",
+        "photo",
+        "picture",
+        "thumbnail",
+        "logo",
+        "symbol",
+        "clipart",
+        "portrait",
+        "painting",
+        "illustration",
+        "icon",
+        "profile",
+    ]
+    pos_tags = ["NN", "NNS", "NNP", "NNPS", "JJ", "JJR", "JJS", "VBG", "VBN"]
+    transforms = []
+    transforms.append(DropTokens())
+    transforms.append(DropURLs())
+    transforms.append(DropSpecialCharacters())
+    transforms.append(DropFileExtensions())
+    transforms.append(ReplaceSeparators())
+    transforms.append(DropShortWords(min_length=3))
+    transforms.append(DropNonAlpha())
+    transforms.append(ToLowercase())
+    transforms.append(ToSingular())
+    transforms.append(DropWords(words=words_to_drop))
+    transforms.append(FrequencyMinWordCount(min_count=2))
+    transforms.append(FilterPOS(tags=pos_tags, engine="flair", keep_compound_nouns=False))
+    transforms.append(RemoveDuplicates())
+    transforms = TextCompose(transforms)
+    return transforms