Spaces:

HUANG-Stephanie
/

cvquest-colpali

Running

App Files Files Community

HUANG-Stephanie committed on Jul 31

Commit

c4d37d5

•

1 Parent(s): d106c36

Delete colpali_engine

Browse files

Files changed (43) hide show

colpali_engine/__init__.py +0 -0
colpali_engine/__pycache__/__init__.cpython-310.pyc +0 -0
colpali_engine/dataset/__init__.py +0 -0
colpali_engine/dataset/custom_collator.py +0 -244
colpali_engine/dataset/hf_dataset_names.py +0 -52
colpali_engine/evaluation/__init__.py +0 -1
colpali_engine/evaluation/eval_manager.py +0 -178
colpali_engine/interpretability/__init__.py +0 -4
colpali_engine/interpretability/gen_interpretability_plots.py +0 -113
colpali_engine/interpretability/plot_utils.py +0 -131
colpali_engine/interpretability/processor.py +0 -116
colpali_engine/interpretability/torch_utils.py +0 -60
colpali_engine/interpretability/vit_configs.py +0 -23
colpali_engine/loss/__init__.py +0 -1
colpali_engine/loss/colbert_loss.py +0 -122
colpali_engine/models/__init__.py +0 -0
colpali_engine/models/__pycache__/__init__.cpython-310.pyc +0 -0
colpali_engine/models/__pycache__/paligemma_colbert_architecture.cpython-310.pyc +0 -0
colpali_engine/models/clip_baselines.py +0 -144
colpali_engine/models/colbert_architectures.py +0 -177
colpali_engine/models/idefics_colbert_architecture.py +0 -57
colpali_engine/models/paligemma_colbert_architecture.py +0 -191
colpali_engine/trainer/__init__.py +0 -0
colpali_engine/trainer/__pycache__/__init__.cpython-310.pyc +0 -0
colpali_engine/trainer/__pycache__/retrieval_evaluator.cpython-310.pyc +0 -0
colpali_engine/trainer/contrastive_trainer.py +0 -64
colpali_engine/trainer/retrieval_evaluator.py +0 -72
colpali_engine/utils/__init__.py +0 -0
colpali_engine/utils/__pycache__/__init__.cpython-310.pyc +0 -0
colpali_engine/utils/__pycache__/colpali_processing_utils.cpython-310.pyc +0 -0
colpali_engine/utils/__pycache__/image_from_page_utils.cpython-310.pyc +0 -0
colpali_engine/utils/colidefics_processing_utils.py +0 -53
colpali_engine/utils/colpali_processing_utils.py +0 -36
colpali_engine/utils/dataset_transformation.py +0 -158
colpali_engine/utils/gpu_stats.py +0 -24
colpali_engine/utils/image_from_page_utils.py +0 -21
colpali_engine/utils/image_utils.py +0 -64
colpali_engine/utils/iter_utils.py +0 -42
colpali_engine/utils/pdf_utils.py +0 -87
colpali_engine/utils/plot_utils.py +0 -6
colpali_engine/utils/torch_utils.py +0 -18
colpali_engine/utils/train_colpali_engine_models.py +0 -247
colpali_engine/utils/wrapper.py +0 -83

colpali_engine/__init__.py DELETED Viewed

File without changes

colpali_engine/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (163 Bytes)

colpali_engine/dataset/__init__.py DELETED Viewed

File without changes

colpali_engine/dataset/custom_collator.py DELETED Viewed

@@ -1,244 +0,0 @@
-from transformers import PreTrainedTokenizer, ProcessorMixin
-class CustomCollator:
-    """
-    Collate retrieval examples into one batch of document and query tensors.
-    Exactly one of `processor` (vision path) or `tokenizer` (text-only path) must be provided.
-    Every key of the returned dict is prefixed with "doc_" or "query_".
-    """
-    def __init__(
-        self,
-        processor: ProcessorMixin = None,
-        tokenizer: PreTrainedTokenizer = None,
-        max_length: int = 2048,
-        add_suffix: bool = False,
-    ):
-        """
-        Args:
-            processor: multimodal processor; dispatch in `__call__` is based on its class name.
-            tokenizer: plain tokenizer used by `forward_text`.
-            max_length: maximum length forwarded to the tokenizer / processor calls.
-            add_suffix: if True, ten newlines are appended to every query in `forward_text`.
-        Raises:
-            ValueError: if neither or both of `processor` and `tokenizer` are given.
-        """
-        # Validate the arguments before touching the processor or the tokenizer.
-        if tokenizer is None and processor is None:
-            raise ValueError("Either processor or tokenizer should be provided.")
-        if tokenizer is not None and processor is not None:
-            raise ValueError("Only one of processor or tokenizer should be provided.")
-        self.processor = processor
-        self.tokenizer = tokenizer
-        self.max_length = max_length
-        self.suffix = "\n" * 10 if add_suffix else ""
-        self.image_token_id = None
-        if self.processor is not None and self.processor.__class__.__name__ != "SiglipProcessor":
-            # Look up the id of the "<image>" special token (skipped for SiglipProcessor).
-            inner_tokenizer = self.processor.tokenizer
-            self.image_token_id = inner_tokenizer.additional_special_tokens_ids[
-                inner_tokenizer.additional_special_tokens.index("<image>")
-            ]
-        if self.tokenizer and self.tokenizer.pad_token is None:
-            # Fall back to the EOS token so that padding="longest" can work.
-            self.tokenizer.pad_token = self.tokenizer.eos_token
-    @staticmethod
-    def _all_queries_missing(texts_query):
-        """Return True if every query is None, False if none is; raise ValueError on a mix."""
-        missing = [text is None for text in texts_query]
-        if all(missing):
-            return True
-        if any(missing):
-            raise ValueError("Some queries are None. This collator does not support None queries yet.")
-        return False
-    @staticmethod
-    def _prefix_and_merge(batch_doc, batch_query):
-        """Prefix keys with "doc_" / "query_" to avoid key conflicts and merge both batches into one dict."""
-        merged = {f"doc_{k}": v for k, v in batch_doc.items()}
-        if batch_query is not None:
-            merged.update({f"query_{k}": v for k, v in batch_query.items()})
-        return merged
-    def __call__(self, examples):
-        """
-        Collate `examples` with the routine matching the configured processor (or tokenizer).
-        Raises:
-            ValueError: if the processor class is not one of the supported ones.
-        """
-        if self.processor is None:
-            return self.forward_text(examples)
-        handlers = {
-            "Idefics2Processor": self.forward_vision_idefics,
-            "PaliGemmaProcessor": self.forward_vision_pali,
-            "SiglipProcessor": self.forward_vision_siglip,
-        }
-        handler = handlers.get(self.processor.__class__.__name__)
-        if handler is None:
-            raise ValueError("Processor not supported")
-        return handler(examples)
-    def forward_text(self, examples):
-        """Tokenize the "doc" and "query" fields of every example with the plain tokenizer."""
-        texts_doc = [example["doc"].strip() for example in examples]
-        # Strip BEFORE appending the suffix: the suffix is pure whitespace, so stripping afterwards
-        # (as was done previously) silently removed it and made `add_suffix` a no-op.
-        texts_query = [example["query"].strip() + self.suffix for example in examples]
-        batch_doc = self.tokenizer(
-            texts_doc, max_length=self.max_length, padding="longest", truncation=True, return_tensors="pt"
-        )
-        batch_query = self.tokenizer(
-            texts_query, max_length=self.max_length, padding="longest", truncation=True, return_tensors="pt"
-        )
-        return self._prefix_and_merge(batch_doc, batch_query)
-    def forward_vision_idefics(self, examples):
-        """
-        Build chat-template prompts for each example and run the processor on them.
-        Queries must be either all None (no "query_" keys are returned) or all present.
-        """
-        texts_doc = []
-        texts_query = []
-        images = []
-        for example in examples:
-            image = example["image"]
-            text_query = None
-            if example["query"] is not None:
-                query = example["query"]
-                messages_query = [
-                    {
-                        "role": "user",
-                        "content": [
-                            {
-                                "type": "text",
-                                "text": f"Question: {query}<end_of_utterance><end_of_utterance><end_of_utterance><end_of_utterance><end_of_utterance>",
-                            },
-                        ],
-                    },
-                ]
-                text_query = self.processor.apply_chat_template(messages_query, add_generation_prompt=False).strip()
-            messages_doc = [
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "text", "text": "Describe the image."},
-                        {"type": "image"},
-                    ],
-                },
-            ]
-            text_doc = self.processor.apply_chat_template(messages_doc, add_generation_prompt=False)
-            texts_doc.append(text_doc.strip())
-            texts_query.append(text_query)
-            # The processor is given one list of images per example.
-            images.append([image])
-        batch_doc = self.processor(
-            text=texts_doc, images=images, return_tensors="pt", padding="longest", max_length=self.max_length
-        )
-        batch_query = None
-        if self._all_queries_missing(texts_query):
-            print("All queries are None. Returning None for all queries.")
-        else:
-            batch_query = self.processor(
-                text=texts_query, return_tensors="pt", padding="longest", max_length=self.max_length
-            )
-        return self._prefix_and_merge(batch_doc, batch_query)
-    def forward_vision_pali(self, examples):
-        """
-        Process RGB images with a fixed document prompt and, when present, the queries.
-        Raises:
-            ValueError: if an image is None, or if only some of the queries are None.
-        """
-        texts_doc = []
-        texts_query = []
-        images = []
-        for example in examples:
-            if example["image"] is None:
-                raise ValueError("Image is None - This collator does not support None images yet.")
-            images.append(example["image"].convert("RGB"))
-            texts_doc.append("Describe the image.")
-            query = example["query"]
-            if query is not None:
-                query = f"Question: {query}<unused0><unused0><unused0><unused0><unused0>"
-            texts_query.append(query)
-        batch_doc = self.processor(
-            text=texts_doc,
-            images=images,
-            return_tensors="pt",
-            padding="longest",
-            max_length=self.max_length + self.processor.image_seq_length,
-        )
-        batch_query = None
-        if self._all_queries_missing(texts_query):
-            print("All queries are None. Returning None for all queries.")
-        else:
-            batch_query = self.processor(
-                images=images,  # NOTE: the image is not used in batch_query but it is required for calling the processor
-                text=texts_query,
-                return_tensors="pt",
-                padding="longest",
-                max_length=self.max_length + self.processor.image_seq_length,
-            )
-            # Drop the image part again: remove the pixels and the leading `image_seq_length` positions.
-            del batch_query["pixel_values"]
-            batch_query["input_ids"] = batch_query["input_ids"][..., self.processor.image_seq_length :]
-            batch_query["attention_mask"] = batch_query["attention_mask"][..., self.processor.image_seq_length :]
-        return self._prefix_and_merge(batch_doc, batch_query)
-    def forward_vision_siglip(self, examples):
-        """
-        Process RGB images and queries with max-length padding; attention masks are derived from the input ids.
-        Raises:
-            ValueError: if an image is None, or if only some of the queries are None.
-        """
-        texts_doc = []
-        texts_query = []
-        images = []
-        for example in examples:
-            if example["image"] is None:
-                raise ValueError("Image is None - This collator does not support None images yet.")
-            images.append(example["image"].convert("RGB"))
-            texts_doc.append("Describe the image.")
-            if example["query"] is None:
-                texts_query.append(None)
-            else:
-                texts_query.append(f"Question: {example['query']}")
-        batch_doc = self.processor(
-            text=texts_doc,
-            images=images,
-            return_tensors="pt",
-            padding="max_length",
-            truncation=True,
-        )
-        batch_query = None
-        # Unlike the other vision paths, the all-None case is silent here.
-        if not self._all_queries_missing(texts_query):
-            batch_query = self.processor(
-                images=images,
-                text=texts_query,
-                return_tensors="pt",
-                padding="max_length",
-                max_length=self.max_length,
-                truncation=True,
-            )
-            del batch_query["pixel_values"]
-        has_queries = batch_query is not None
-        batch = self._prefix_and_merge(batch_doc, batch_query)
-        # Build the attention masks by marking every position whose token id is not 0.
-        if has_queries:
-            batch["query_attention_mask"] = batch["query_input_ids"].ne(0).long()
-        batch["doc_attention_mask"] = batch["doc_input_ids"].ne(0).long()
-        return batch

colpali_engine/dataset/hf_dataset_names.py DELETED Viewed

@@ -1,52 +0,0 @@
-from enum import Enum
-class TrainDatasets(Enum):
-    """
-    Dataset names for the training datasets used in HuggingFace Datasets.
-    Each member's value is a dataset identifier under the "vidore/" namespace.
-    """
-    # The four "syntheticDocQA" sets (also returned by `get_synthetic_datasets`).
-    government_reports = "vidore/syntheticDocQA_government_reports_train"
-    healthcare_industry = "vidore/syntheticDocQA_healthcare_industry_train"
-    energy = "vidore/syntheticDocQA_energy_train"
-    artificial_intelligence = "vidore/syntheticDocQA_artificial_intelligence_train"
-    # The remaining training sets.
-    arxivqa = "vidore/arxivqa_train"
-    docvqa = "vidore/docvqa_train"
-    infovqa = "vidore/infovqa_train"
-    tatqa = "vidore/tatqa_train"
-    @staticmethod
-    def get_synthetic_datasets():
-        """Return the four "syntheticDocQA" members as a list (enum members, not their string values)."""
-        return [
-            TrainDatasets.government_reports,
-            TrainDatasets.healthcare_industry,
-            TrainDatasets.energy,
-            TrainDatasets.artificial_intelligence,
-        ]
-class TestImagesDirpath(Enum):
-    """
-    Relative directory paths (all under `data/`), one per test dataset.
-    NOTE(review): judging by the class name these are the directories holding the test images - confirm.
-    """
-    government_reports = "data/government_reports"
-    healthcare_industry = "data/healthcare_industry"
-    energy = "data/energy"
-    # NOTE(review): the only entry that points into a nested directory - confirm this is intentional.
-    artificial_intelligence = "data/scrapped_pdfs_split/pages_extracted/artificial_intelligence_test"
-    arxivqa = "data/arxivqa"
-    docvqa = "data/docvqa"
-    infovqa = "data/infovqa"
-    tatqa = "data/tatqa"
-class CaptionedSyntheticDatasets(Enum):
-    """
-    Dataset names for the captioned synthetic datasets used in HuggingFace Datasets.
-    Currently holds a single entry.
-    """
-    shift = "vidore/baseline_cap_shiftproject_test"
-class SyntheticDocQATest(Enum):
-    """
-    Dataset name for the "shiftproject" test set.
-    NOTE(review): presumably a HuggingFace dataset identifier, like the other "vidore/" values in this file - confirm.
-    """
-    shift = "vidore/shiftproject_test"

colpali_engine/evaluation/__init__.py DELETED Viewed

@@ -1 +0,0 @@
-from .eval_manager import EvalManager

colpali_engine/evaluation/eval_manager.py DELETED Viewed

@@ -1,178 +0,0 @@
-from __future__ import annotations
-from pathlib import Path
-from typing import Any, ClassVar, Dict, Optional
-import pandas as pd
-class EvalManager:
-    """
-    Stores evaluation results for various datasets and metrics.
-    The data is stored in a pandas DataFrame with a MultiIndex for columns.
-    The first level of the MultiIndex is the dataset name and the second level is the metric name.
-    The index (one row per model) is named `model_col`.
-    Usage:
-    >>> evaluator = EvalManager.from_dir("data/evaluation_results/")
-    >>> print(evaluator.data)
-    """
-    model_col: ClassVar[str] = "model"
-    dataset_col: ClassVar[str] = "dataset"
-    metric_col: ClassVar[str] = "metric"
-    def __init__(self, data: Optional[pd.DataFrame] = None):
-        """
-        Wrap `data` (an empty DataFrame if None) and name its index `model_col`.
-        """
-        if data is None:
-            data = pd.DataFrame()
-        # Work on a shallow copy so that renaming the index does not leak into the caller's DataFrame.
-        self._df = data.copy(deep=False)
-        self._df.index = self._df.index.rename(EvalManager.model_col)
-    def __str__(self) -> str:
-        return self.data.__str__()
-    @staticmethod
-    def from_dict(data: Dict[Any, Any]) -> EvalManager:
-        """
-        Load evaluation results from a dictionary.
-        Expected format:
-        {
-            "model1": pd.read_json(path1).T.stack(),
-            "model2": pd.read_json(path2).T.stack(),
-        }
-        """
-        df = pd.DataFrame.from_dict(data, orient="index")
-        return EvalManager(df)
-    @staticmethod
-    def from_json(path: str | Path) -> EvalManager:
-        """
-        Load the evaluation results of a single model from a JSON file.
-        The file stem is used as the model name.
-        Raises:
-            FileNotFoundError: if `path` is not an existing file.
-        """
-        datapath = Path(path)
-        if not datapath.is_file():
-            raise FileNotFoundError(f"{path} is not a file")
-        data = {datapath.stem: pd.read_json(datapath).T.stack()}  # pylint: disable=no-member
-        return EvalManager.from_dict(data)
-    @staticmethod
-    def from_dir(datadir: str | Path) -> EvalManager:
-        """
-        Load the evaluation results of every `*.json` file directly inside `datadir`.
-        Each file stem is used as the model name.
-        Raises:
-            FileNotFoundError: if `datadir` is not an existing directory.
-        """
-        datadir_ = Path(datadir)
-        if not datadir_.is_dir():
-            raise FileNotFoundError(f"{datadir} is not a directory")
-        data = {
-            filepath.stem: pd.read_json(filepath).T.stack()  # pylint: disable=no-member
-            for filepath in datadir_.glob("*.json")
-        }
-        return EvalManager.from_dict(data)
-    @staticmethod
-    def from_csv(path: str | Path) -> EvalManager:
-        """
-        Load evaluation results from a CSV file.
-        The first column is read as the index and the first two rows as the column MultiIndex.
-        Any error raised while reading is printed and re-raised.
-        """
-        try:
-            df = pd.read_csv(path, index_col=0, header=[0, 1])
-        except Exception as e:
-            print(f"Error loading {path}: {e}")
-            # Bare `raise` keeps the original traceback intact.
-            raise
-        return EvalManager(df)
-    @property
-    def data(self) -> pd.DataFrame:
-        """
-        Returns the evaluation results as a pandas DataFrame.
-        A copy is returned, so modifying it does not affect the manager.
-        """
-        return self._df.copy()
-    @property
-    def models(self) -> pd.Index:
-        """
-        Returns the models for which there are evaluation results.
-        """
-        return self.data.index
-    @property
-    def datasets(self) -> pd.Index:
-        """
-        Returns the datasets for which there are evaluation results.
-        """
-        return self.data.columns.get_level_values(0).unique()
-    @property
-    def metrics(self) -> pd.Index:
-        """
-        Returns the metrics for which there are evaluation results.
-        """
-        # De-duplicated like `datasets`: a metric shared by several datasets is listed once.
-        return self.data.columns.get_level_values(1).unique()
-    @staticmethod
-    def melt(df: pd.DataFrame) -> pd.DataFrame:
-        """
-        Melt a suitable DataFrame (e.g. returned by `get_df_for_dataset` and
-        `get_df_for_metric`) into a 'long' format.
-        The result has the columns `dataset_col`, `metric_col`, `model_col` and "score".
-        """
-        return df.T.reset_index(names=[EvalManager.dataset_col, EvalManager.metric_col]).melt(
-            id_vars=[EvalManager.dataset_col, EvalManager.metric_col],
-            var_name=EvalManager.model_col,
-            value_name="score",
-        )
-    @property
-    def melted(self) -> pd.DataFrame:
-        """
-        Returns the evaluation results as a 'melted' DataFrame.
-        Useful for plotting with seaborn.
-        """
-        return EvalManager.melt(self.data)
-    def get_df_for_model(self, model: str) -> pd.DataFrame:
-        """
-        Returns the single-row DataFrame of `model`; raises ValueError if the model is unknown.
-        """
-        if model not in self.data.index:
-            raise ValueError(f"Model {model} not found in the evaluation results")
-        return self.data.loc[[model], :]  # type: ignore
-    def get_df_for_dataset(self, dataset: str) -> pd.DataFrame:
-        """
-        Returns all the metric columns of `dataset`; raises ValueError if the dataset is unknown.
-        """
-        if dataset not in self.datasets:
-            raise ValueError(f"Dataset {dataset} not found in the evaluation results")
-        return self.data.loc[:, (dataset, slice(None))]  # type: ignore
-    def get_df_for_metric(self, metric: str) -> pd.DataFrame:
-        """
-        Returns the columns of `metric` across all datasets; raises ValueError if the metric is unknown.
-        """
-        if metric not in self.metrics:
-            raise ValueError(f"Metric {metric} not found in the evaluation results")
-        return self.data.loc[:, (slice(None), metric)]  # type: ignore
-    def sort_by_dataset(self, ascending: bool = True) -> EvalManager:
-        """
-        Sort the evaluation results by dataset name.
-        Returns a new EvalManager.
-        """
-        df = self.data.T.sort_index(level=0, ascending=ascending).T
-        return EvalManager(df)
-    def sort_by_metric(self, ascending: bool = True) -> EvalManager:
-        """
-        Sort the evaluation results by metric name.
-        Returns a new EvalManager.
-        """
-        df = self.data.T.sort_index(level=1, ascending=ascending).T
-        return EvalManager(df)
-    def sort_columns(self, ascending: bool = True) -> EvalManager:
-        """
-        Sort the evaluation results by dataset name and then by metric name.
-        Returns a new EvalManager.
-        """
-        df = self.data.T.sort_index(level=[0, 1], ascending=ascending).T
-        return EvalManager(df)
-    def to_csv(self, path: str | Path):
-        """
-        Save the evaluation results to a CSV file (missing parent directories are created).
-        Using `EvalManager.from_csv(path_to_saved_csv)` will load the evaluation results back into memory.
-        """
-        savepath = Path(path)
-        savepath.parent.mkdir(parents=True, exist_ok=True)
-        self.data.to_csv(savepath)

colpali_engine/interpretability/__init__.py DELETED Viewed

@@ -1,4 +0,0 @@
-from .plot_utils import *
-from .processor import *
-from .torch_utils import *
-from .vit_configs import *

colpali_engine/interpretability/gen_interpretability_plots.py DELETED Viewed

@@ -1,113 +0,0 @@
-import pprint
-from dataclasses import asdict, dataclass
-from pathlib import Path
-from uuid import uuid4
-import matplotlib.pyplot as plt
-import torch
-from einops import rearrange
-from PIL import Image
-from tqdm import trange
-from colpali_engine.interpretability.plot_utils import plot_patches
-from colpali_engine.interpretability.processor import ColPaliProcessor
-from colpali_engine.interpretability.torch_utils import normalize_attention_map_per_query_token
-from colpali_engine.interpretability.vit_configs import VIT_CONFIG
-from colpali_engine.models.paligemma_colbert_architecture import ColPali
-OUTDIR_INTERPRETABILITY = Path("outputs/interpretability")
-@dataclass
-class InterpretabilityInput:
-    """
-    Plain container pairing a query string and an image with a token index range.
-    NOTE(review): whether `end_idx_token` is inclusive cannot be told from this file - confirm with the callers.
-    """
-    query: str
-    image: Image.Image
-    start_idx_token: int
-    end_idx_token: int
-def generate_interpretability_plots(
-    model: ColPali,
-    processor: ColPaliProcessor,
-    query: str,
-    image: Image.Image,
-    savedir: str | Path | None = None,
-    add_special_prompt_to_doc: bool = True,
-) -> None:
-    """
-    Save one patch-opacity plot per query token, showing how strongly that token matches each image patch.
-    Args:
-        model: model with exactly one active adapter, whose `config.name_or_path` is a key of `VIT_CONFIG`.
-        processor: processor used to prepare both the query and the image.
-        query: the text query.
-        image: the document image.
-        savedir: output directory; a fresh directory under `OUTDIR_INTERPRETABILITY` is used when empty.
-        add_special_prompt_to_doc: forwarded to `processor.process_image` as `add_special_prompt`.
-    Raises:
-        ValueError: if one of the two model sanity checks fails.
-    """
-    # Sanity checks
-    if len(model.active_adapters()) != 1:
-        raise ValueError("The model must have exactly one active adapter.")
-    if model.config.name_or_path not in VIT_CONFIG:
-        raise ValueError("The model must be referred to in the VIT_CONFIG dictionary.")
-    vit_config = VIT_CONFIG[model.config.name_or_path]
-    # Resolve the output directory
-    if not savedir:
-        savedir = OUTDIR_INTERPRETABILITY / str(uuid4())
-        print(f"No savepath provided. Results will be saved to: `{savedir}`.")
-    elif isinstance(savedir, str):
-        savedir = Path(savedir)
-    savedir.mkdir(parents=True, exist_ok=True)
-    # Square version of the image, only used as the plot background
-    side = vit_config.resolution
-    square_image = image.resize((side, side))
-    # Prepare the model inputs on the model's device
-    text_inputs = processor.process_text(query).to(model.device)
-    image_inputs = processor.process_image(image, add_special_prompt=add_special_prompt_to_doc).to(model.device)
-    # Forward passes without gradient tracking
-    with torch.no_grad():
-        text_embeddings = model.forward(**asdict(text_inputs))  # (1, n_text_tokens, hidden_dim)
-        # (1, n_patch_x * n_patch_y [+ n_special_tokens], hidden_dim); the extra tokens are only
-        # present when `add_special_prompt_to_doc` is True
-        image_embeddings = model.forward(**asdict(image_inputs))
-    if add_special_prompt_to_doc:
-        # Keep the image positions only, i.e. drop the special-prompt tokens
-        n_image_positions = processor.processor.image_seq_length
-        image_embeddings = image_embeddings[:, :n_image_positions, :]  # (1, n_patch_x * n_patch_y, hidden_dim)
-    grid = vit_config.n_patch_per_dim
-    image_embeddings = rearrange(image_embeddings, "b (h w) c -> b h w c", h=grid, w=grid)
-    # Dot product between every text token and every patch: (1, n_text_tokens, n_patch_x, n_patch_y)
-    attention_map = torch.einsum("bnk,bijk->bnij", text_embeddings, image_embeddings)
-    attention_map_normalized = normalize_attention_map_per_query_token(attention_map).float()
-    # Recover the text tokens for the plot titles
-    token_ids = text_inputs.input_ids
-    n_tokens = token_ids.size(1)
-    text_tokens = processor.tokenizer.tokenize(processor.decode(token_ids[0]))
-    print("Text tokens:")
-    pprint.pprint(text_tokens)
-    print("\n")
-    # The first and the last positions (the <bos> and the "\n" tokens) are skipped
-    for token_idx in trange(1, n_tokens - 1, desc="Iterating over tokens..."):
-        fig, _ = plot_patches(
-            square_image,
-            vit_config.patch_size,
-            vit_config.resolution,
-            patch_opacities=attention_map_normalized[0, token_idx, :, :],
-            style="dark_background",
-        )
-        token = text_tokens[token_idx]
-        fig.suptitle(f"Token #{token_idx}: `{token}`", color="white", fontsize=14)
-        savepath = savedir / f"token_{token_idx}.png"
-        fig.savefig(savepath)
-        print(f"Saved attention map for token {token_idx} (`{token}`) to `{savepath}`.\n")
-        # Close the figure so that figures do not pile up across tokens
-        plt.close(fig)

colpali_engine/interpretability/plot_utils.py DELETED Viewed

@@ -1,131 +0,0 @@
-from typing import Any, Dict, Optional, Tuple, cast
-import matplotlib.pyplot as plt
-import numpy as np
-import numpy.typing as npt
-import seaborn as sns
-import torch
-from PIL import Image
-MAX_OPACITY = 255
-def plot_patches(
-    img: Image.Image,
-    patch_size: int,
-    image_resolution: int,
-    patch_opacities: Optional[npt.NDArray | torch.Tensor] = None,
-    figsize: Tuple[int, int] = (8, 8),
-    style: Dict[str, Any] | str | None = None,
-) -> Tuple[plt.Figure, plt.Axes]:
-    """
-    Plot patches of a square image.
-    Set `style` to "dark_background" if your image has a light background.
-    Args:
-        img: square image to cut into patches.
-        patch_size: side of one patch, in pixels.
-        image_resolution: side of the image, in pixels; must be a multiple of `patch_size`.
-        patch_opacities: optional (num_patches, num_patches) values in [0, 1], used as the alpha of each patch.
-        figsize: size of the matplotlib figure.
-        style: matplotlib style (name or dict) applied while the figure is created.
-    Returns:
-        The figure and the 2D (num_patches, num_patches) array of its axes.
-    Raises:
-        ValueError: if the resolution is not divisible by the patch size, if `patch_opacities` has the wrong
-            shape or range, or if the image is not square.
-    """
-    # Get the number of patches
-    if image_resolution % patch_size != 0:
-        raise ValueError("The image resolution must be divisible by the patch size.")
-    num_patches = image_resolution // patch_size
-    # Default style
-    if style is None:
-        style = {}
-    # Sanity checks
-    if patch_opacities is not None:
-        if isinstance(patch_opacities, torch.Tensor):
-            patch_opacities = cast(npt.NDArray, patch_opacities.cpu().numpy())
-        if patch_opacities.shape != (num_patches, num_patches):
-            raise ValueError("The shape of the patch_opacities tensor is not correct.")
-        if not np.all((0 <= patch_opacities) & (patch_opacities <= 1)):
-            raise ValueError("The patch_opacities tensor must have values between 0 and 1.")
-    # If the image is not square, raise an error
-    if img.size[0] != img.size[1]:
-        raise ValueError("The image must be square.")
-    # Get the image as a numpy array
-    img_array = np.array(img.convert("RGBA"))  # (H, W, C) where the last channel is the alpha channel
-    # Create a figure
-    with plt.style.context(style):
-        # squeeze=False keeps `axis` two-dimensional even for a 1x1 grid (patch_size == image_resolution);
-        # without it `plt.subplots` returns a bare Axes and `axis[i, j]` below fails.
-        fig, axis = plt.subplots(num_patches, num_patches, figsize=figsize, squeeze=False)
-        # Plot the patches
-        for i in range(num_patches):
-            for j in range(num_patches):
-                # `patch` is a view into `img_array`, so the alpha channel is updated in place
-                patch = img_array[i * patch_size : (i + 1) * patch_size, j * patch_size : (j + 1) * patch_size, :]
-                # Set the opacity of the patch
-                if patch_opacities is not None:
-                    patch[:, :, -1] = round(patch_opacities[i, j] * MAX_OPACITY)
-                axis[i, j].imshow(patch)
-                axis[i, j].axis("off")
-        fig.subplots_adjust(wspace=0.1, hspace=0.1)
-    fig.tight_layout()
-    return fig, axis
-def plot_attention_heatmap(
-    img: Image.Image,
-    patch_size: int,
-    image_resolution: int,
-    attention_map: npt.NDArray | torch.Tensor,
-    figsize: Tuple[int, int] = (8, 8),
-    style: Dict[str, Any] | str | None = None,
-    show_colorbar: bool = False,
-    show_axes: bool = False,
-) -> Tuple[plt.Figure, plt.Axes]:
-    """
-    Plot a heatmap of the attention map over the image.
-    The image must be square and `attention_map` must be normalized between 0 and 1.
-    Args:
-        img: square image used as the background.
-        patch_size: side of one patch, in pixels.
-        image_resolution: side of the image, in pixels; must be a multiple of `patch_size`.
-        attention_map: (num_patches, num_patches) values in [0, 1].
-        figsize: size of the matplotlib figure.
-        style: matplotlib style (name or dict) applied while the figure is created.
-        show_colorbar: whether to add a colorbar for the heatmap.
-        show_axes: whether to keep the axes visible.
-    Returns:
-        The figure and its single axes.
-    Raises:
-        ValueError: if the resolution is not divisible by the patch size, if `attention_map` has the wrong
-            shape or range, or if the image is not square.
-    """
-    # Get the number of patches
-    if image_resolution % patch_size != 0:
-        raise ValueError("The image resolution must be divisible by the patch size.")
-    num_patches = image_resolution // patch_size
-    # Default style
-    if style is None:
-        style = {}
-    # Sanity checks (the messages name the actual argument, `attention_map`)
-    if isinstance(attention_map, torch.Tensor):
-        attention_map = cast(npt.NDArray, attention_map.cpu().numpy())
-    if attention_map.shape != (num_patches, num_patches):
-        raise ValueError("The shape of the attention_map tensor is not correct.")
-    if not np.all((0 <= attention_map) & (attention_map <= 1)):
-        raise ValueError("The attention_map tensor must have values between 0 and 1.")
-    # If the image is not square, raise an error
-    if img.size[0] != img.size[1]:
-        raise ValueError("The image must be square.")
-    # Get the image as a numpy array
-    img_array = np.array(img.convert("RGBA"))  # (H, W, C) where the last channel is the alpha channel
-    # Scale the attention map to 8-bit and upsample it to the image size
-    attention_map_image = Image.fromarray((attention_map * 255).astype("uint8")).resize(
-        img.size, Image.Resampling.BICUBIC
-    )
-    # Create a figure
-    with plt.style.context(style):
-        fig, ax = plt.subplots(figsize=figsize)
-        ax.imshow(img_array)
-        # Half-transparent heatmap drawn on top of the image
-        im = ax.imshow(
-            attention_map_image,
-            cmap=sns.color_palette("mako", as_cmap=True),
-            alpha=0.5,
-        )
-        if show_colorbar:
-            fig.colorbar(im)
-        if not show_axes:
-            ax.set_axis_off()
-        fig.tight_layout()
-    return fig, ax

colpali_engine/interpretability/processor.py DELETED Viewed

@@ -1,116 +0,0 @@
-from __future__ import annotations
-from dataclasses import dataclass
-from typing import List, cast
-import torch
-from PIL import Image
-from transformers import LlamaTokenizerFast, PaliGemmaProcessor
-@dataclass
-class ColPaliTextInput:
-    """
-    Tokenized text batch: the token ids and their attention mask.
-    """
-    input_ids: torch.Tensor
-    attention_mask: torch.Tensor
-    def to(self, device: str | torch.device) -> ColPaliTextInput:
-        """
-        Return a new ColPaliTextInput whose tensors have been moved to `device`.
-        Accepts a device name or a `torch.device`, like `ColPaliImageInput.to`.
-        """
-        return ColPaliTextInput(
-            input_ids=self.input_ids.to(device),
-            attention_mask=self.attention_mask.to(device),
-        )
-@dataclass
-class ColPaliImageInput:
-    """
-    Processed image batch: the token ids, the pixel values and the attention mask.
-    """
-    input_ids: torch.Tensor
-    pixel_values: torch.Tensor
-    attention_mask: torch.Tensor
-    def to(self, device: str | torch.device) -> ColPaliImageInput:
-        """
-        Return a new ColPaliImageInput whose tensors have been moved to `device`.
-        """
-        field_names = ("input_ids", "pixel_values", "attention_mask")
-        moved = {name: getattr(self, name).to(device) for name in field_names}
-        return ColPaliImageInput(**moved)
-class ColPaliProcessor:
-    """
-    Thin wrapper around a `PaliGemmaProcessor` that returns typed text / image inputs.
-    """
-    def __init__(self, processor: PaliGemmaProcessor):
-        self.processor = processor
-        # Keep a direct handle on the processor's tokenizer for the text-only calls.
-        self.tokenizer = cast(LlamaTokenizerFast, self.processor.tokenizer)  # type: ignore
-    @staticmethod
-    def from_pretrained(model_name: str) -> ColPaliProcessor:
-        """
-        Build a ColPaliProcessor from the `PaliGemmaProcessor` loaded for `model_name`.
-        """
-        inner = cast(PaliGemmaProcessor, PaliGemmaProcessor.from_pretrained(model_name))
-        return ColPaliProcessor(processor=inner)
-    def process_text(
-        self,
-        text: str | List[str],
-        padding: str = "longest",
-        return_tensors: str = "pt",
-        add_special_tokens: bool = True,
-    ) -> ColPaliTextInput:
-        """
-        Tokenize one text or a list of texts.
-        If `add_special_tokens` is True (default), every text is prepended with the <bos> token and
-        appended with "\n" before tokenization; the flag is also forwarded to the tokenizer.
-        Raises:
-            ValueError: if `add_special_tokens` is True and `text` is neither a string nor a list.
-        """
-        if add_special_tokens:
-            if isinstance(text, list):
-                text = [self.tokenizer.bos_token + item + "\n" for item in text]
-            elif isinstance(text, str):
-                text = self.tokenizer.bos_token + text + "\n"
-            else:
-                raise ValueError("text must be a string or a list of strings.")
-        encoded = self.tokenizer(
-            text,
-            padding=padding,
-            return_tensors=return_tensors,
-            add_special_tokens=add_special_tokens,
-        )
-        input_ids = cast(torch.Tensor, encoded["input_ids"])
-        attention_mask = cast(torch.Tensor, encoded["attention_mask"])
-        return ColPaliTextInput(input_ids=input_ids, attention_mask=attention_mask)
-    def process_image(
-        self,
-        image: Image.Image | List[Image.Image],
-        padding: str = "longest",
-        do_convert_rgb: bool = True,
-        return_tensors: str = "pt",
-        add_special_prompt: bool = True,
-    ) -> ColPaliImageInput:
-        """
-        Process one image or a list of images.
-        If `add_special_prompt` is True (default), each image is paired with the "Describe the image."
-        prompt; otherwise every output tensor is cut to `image_seq_length` along its second dimension.
-        Raises:
-            ValueError: if `image` is neither a PIL image nor a list.
-        """
-        # NOTE: The special prompt was used at training time,
-        if add_special_prompt:
-            special_prompt = "Describe the image."
-        else:
-            special_prompt = None
-        # One prompt entry per image
-        if isinstance(image, Image.Image):
-            n_images = 1
-        elif isinstance(image, list):
-            n_images = len(image)
-        else:
-            raise ValueError("image must be a PIL Image or a list of PIL Images.")
-        text_input = [special_prompt] * n_images
-        batch_output = self.processor(
-            text=text_input,
-            images=image,
-            padding=padding,
-            do_convert_rgb=do_convert_rgb,
-            return_tensors=return_tensors,
-        )
-        input_ids = batch_output["input_ids"]
-        pixel_values = batch_output["pixel_values"]
-        attention_mask = batch_output["attention_mask"]
-        if not add_special_prompt:
-            # NOTE(review): the same slice is applied to `pixel_values` as well - confirm this is intended.
-            keep = slice(None, self.processor.image_seq_length)
-            input_ids = input_ids[:, keep]
-            pixel_values = pixel_values[:, keep]
-            attention_mask = attention_mask[:, keep]
-        return ColPaliImageInput(
-            input_ids=input_ids,
-            pixel_values=pixel_values,
-            attention_mask=attention_mask,
-        )
-    def decode(self, *args, **kwargs):
-        """Forward to the tokenizer's `decode`."""
-        return self.tokenizer.decode(*args, **kwargs)
-    def batch_decode(self, *args, **kwargs):
-        """Forward to the tokenizer's `batch_decode`."""
-        return self.tokenizer.batch_decode(*args, **kwargs)

colpali_engine/interpretability/torch_utils.py DELETED Viewed

@@ -1,60 +0,0 @@
-import logging
-import torch
-logger = logging.getLogger(__name__)
-EPSILON = 1e-10
-def normalize_attention_map_per_query_token(x: torch.Tensor) -> torch.Tensor:
-    """
-    Min-max rescale each 2D patch map of a ColPali attention tensor independently.
-    The extrema are taken over the last two dimensions only, so every
-    (batch, query token) pair gets its own scaling. The result has the same
-    shape as the input, with values in the range [0, 1].
-    Args:
-        x: The attention map tensor of shape (batch_size, n_text_tokens, n_patch_x, n_patch_y).
-    Raises:
-        ValueError: If `x` does not have exactly 4 dimensions.
-    """
-    if x.ndim != 4:
-        raise ValueError("The input tensor must have 4 dimensions.")
-    lowest, highest = x, x
-    # Reduce n_patch_y first, then n_patch_x, keeping both axes for broadcasting.
-    for axis in (-1, -2):
-        lowest = lowest.min(dim=axis, keepdim=True)[0]
-        highest = highest.max(dim=axis, keepdim=True)[0]
-    # EPSILON keeps the denominator non-zero when a map is constant.
-    return (x - lowest) / (highest - lowest + EPSILON)
-def normalize_attention_map_per_query(x: torch.Tensor) -> torch.Tensor:
-    """
-    Min-max rescale a ColPali attention tensor with one scale per batch item.
-    The extrema are taken jointly over the last three dimensions
-    (n_text_tokens, n_patch_x, n_patch_y), so all query tokens of an item share
-    the same scaling. A warning is logged on every call because this is not the
-    recommended normalization for ColPali. The result has the same shape as the
-    input, with values in the range [0, 1].
-    Args:
-        x: The attention map tensor of shape (batch_size, n_text_tokens, n_patch_x, n_patch_y).
-    Raises:
-        ValueError: If `x` does not have exactly 4 dimensions.
-    """
-    logger.warning(
-        "This function should not be used for ColPali because it doesn't make sense to normalize the attention map across the text tokens."
-    )
-    if x.ndim != 4:
-        raise ValueError("The input tensor must have 4 dimensions.")
-    lowest, highest = x, x
-    # Reduce n_patch_y, n_patch_x and finally n_text_tokens, keeping all axes for broadcasting.
-    for axis in (-1, -2, -3):
-        lowest = lowest.min(dim=axis, keepdim=True)[0]
-        highest = highest.max(dim=axis, keepdim=True)[0]
-    # EPSILON keeps the denominator non-zero when the whole item is constant.
-    return (x - lowest) / (highest - lowest + EPSILON)

colpali_engine/interpretability/vit_configs.py DELETED Viewed

@@ -1,23 +0,0 @@
-from dataclasses import dataclass
-from typing import Dict
-@dataclass
-class ViTConfig:
-    """Patch geometry of a vision transformer: patch side length and input resolution."""
-    patch_size: int
-    resolution: int
-    @property
-    def n_patch_per_dim(self) -> int:
-        """
-        Number of patches along one image side (resolution divided by patch_size).
-        Raises:
-            ValueError: If the resolution is not an exact multiple of the patch size.
-        """
-        n_patches, leftover = divmod(self.resolution, self.patch_size)
-        if leftover:
-            raise ValueError(f"Resolution {self.resolution} is not divisible by patch size {self.patch_size}")
-        return n_patches
-# Lookup table from a model name string to its ViTConfig (keys look like Hugging Face model ids).
-# NOTE(review): 384 is not a multiple of 14, so ViTConfig.n_patch_per_dim raises ValueError for the
-# two 384-resolution entries; only the 448 entry divides evenly (448 // 14 == 32). Confirm intent.
-VIT_CONFIG: Dict[str, ViTConfig] = {
-    "google/siglip-so400m-patch14-384": ViTConfig(patch_size=14, resolution=384),
-    "timm/ViT-SO400M-14-SigLIP-384": ViTConfig(patch_size=14, resolution=384),
-    "google/paligemma-3b-mix-448": ViTConfig(
-        patch_size=14, resolution=448
-    ),  # based on "timm/ViT-SO400M-14-SigLIP-384" with increased resolution
-}

colpali_engine/loss/__init__.py DELETED Viewed

@@ -1 +0,0 @@
-from .colbert_loss import ColbertLoss

colpali_engine/loss/colbert_loss.py DELETED Viewed

@@ -1,122 +0,0 @@
-import torch
-import torch.nn.functional as F
-from torch.nn import CrossEntropyLoss
-class BiEncoderLoss(torch.nn.Module):
-    """In-batch cross-entropy loss for single-vector query and document embeddings."""
-    def __init__(self):
-        super().__init__()
-        self.ce_loss = CrossEntropyLoss()
-    def forward(self, query_embeddings, doc_embeddings):
-        """
-        Score every query against every document with a dot product and apply
-        row-wise cross-entropy, treating the i-th document as the positive for the i-th query.
-        query_embeddings: (batch_size, dim)
-        doc_embeddings: (batch_size, dim)
-        Returns the scalar cross-entropy loss.
-        """
-        similarity = torch.einsum("bd,cd->bc", query_embeddings, doc_embeddings)
-        # Row i's correct class is column i (the diagonal of the score matrix).
-        targets = torch.arange(similarity.shape[0], device=similarity.device)
-        return self.ce_loss(similarity, targets)
-class ColbertLoss(torch.nn.Module):
-    """In-batch cross-entropy loss over ColBERT late-interaction (MaxSim) scores."""
-    def __init__(self):
-        super().__init__()
-        self.ce_loss = CrossEntropyLoss()
-    def forward(self, query_embeddings, doc_embeddings):
-        """
-        query_embeddings: (batch_size, num_query_tokens, dim)
-        doc_embeddings: (batch_size, num_doc_tokens, dim)
-        For each (query, document) pair, every query token takes its best dot
-        product over the document tokens and those maxima are summed. Row-wise
-        cross-entropy then treats the i-th document as the positive for the i-th query.
-        Returns the scalar cross-entropy loss.
-        """
-        # token_scores[b, c, n, s]: query b / token n against document c / token s.
-        token_scores = torch.einsum("bnd,csd->bcns", query_embeddings, doc_embeddings)
-        best_per_query_token = token_scores.max(dim=3)[0]
-        scores = best_per_query_token.sum(dim=2)  # (batch_size, batch_size)
-        targets = torch.arange(scores.shape[0], device=scores.device)
-        # TODO: comparing between queries might not make sense since it's a sum over the length of the query,
-        # which is why only the row-wise (query -> documents) term is used.
-        return self.ce_loss(scores, targets)
-class ColbertPairwiseCELoss(torch.nn.Module):
-    """Pairwise softplus loss between each query's positive score and its hardest in-batch negative."""
-    def __init__(self):
-        super().__init__()
-        self.ce_loss = CrossEntropyLoss()
-    def forward(self, query_embeddings, doc_embeddings):
-        """
-        query_embeddings: (batch_size, num_query_tokens, dim)
-        doc_embeddings: (batch_size, num_doc_tokens, dim)
-        Positive scores are the diagonal of the scores matrix.
-        Returns the mean over the batch of softplus(hardest_negative - positive).
-        """
-        # ColBERT scores: best document token per query token, summed over query tokens.
-        token_scores = torch.einsum("bnd,csd->bcns", query_embeddings, doc_embeddings)
-        scores = token_scores.max(dim=3)[0].sum(dim=2)  # (batch_size, batch_size)
-        positives = scores.diagonal()  # (batch_size,)
-        # The negative for a query is its highest score against all other pages.
-        # NOTE: The diagonal is excluded by subtracting a large constant (1e6) from it before the max.
-        diagonal_penalty = torch.eye(scores.shape[0], device=scores.device) * 1e6
-        hardest_negatives = (scores - diagonal_penalty).max(dim=1)[0]  # (batch_size,)
-        # softplus(neg - pos) equals -log(softmax([pos, neg])[0]), i.e. the two-way
-        # cross-entropy of the positive against the hardest negative, in a stable form.
-        return F.softplus(hardest_negatives - positives).mean()
-class BiPairwiseCELoss(torch.nn.Module):
-    """Pairwise softplus loss for single-vector embeddings against the hardest in-batch negative."""
-    def __init__(self):
-        super().__init__()
-        self.ce_loss = CrossEntropyLoss()
-    def forward(self, query_embeddings, doc_embeddings):
-        """
-        query_embeddings: (batch_size, dim)
-        doc_embeddings: (batch_size, dim)
-        Returns the mean over the batch of softplus(hardest_negative - positive),
-        where positives are the diagonal of the dot-product score matrix.
-        """
-        scores = torch.einsum("bd,cd->bc", query_embeddings, doc_embeddings)
-        positives = scores.diagonal()
-        # Exclude the diagonal from the row-wise max by subtracting a large constant (1e6) from it.
-        diagonal_penalty = torch.eye(scores.shape[0], device=scores.device) * 1e6
-        hardest_negatives = (scores - diagonal_penalty).max(dim=1)[0]
-        # softplus(neg - pos) equals -log(softmax([pos, neg])[0]) in a numerically stable form.
-        return F.softplus(hardest_negatives - positives).mean()

colpali_engine/models/__init__.py DELETED Viewed

File without changes

colpali_engine/models/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (170 Bytes)

colpali_engine/models/__pycache__/paligemma_colbert_architecture.cpython-310.pyc DELETED Viewed

Binary file (4.87 kB)

colpali_engine/models/clip_baselines.py DELETED Viewed

@@ -1,144 +0,0 @@
-import os
-from typing import Optional
-import torch
-from transformers import SiglipModel
-class SigLIP(SiglipModel):
-    """SiglipModel variant whose forward returns one L2-normalized embedding per input."""
-    def forward(self, *args, **kwargs):
-        """
-        Delegate to `forward_branch` with the same arguments.
-        Args:
-        - input_ids (torch.LongTensor): The input tokens tensor (text inputs).
-        - pixel_values (torch.FloatTensor): The image tensor (image inputs).
-        - attention_mask (torch.LongTensor): The attention mask tensor.
-        Returns:
-        - torch.Tensor: L2-normalized embeddings.
-        """
-        return self.forward_branch(*args, **kwargs)
-    def forward_branch(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        return_loss: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-        interpolate_pos_encoding: bool = False,
-    ):
-        """
-        Encode with the vision tower when `pixel_values` is given, otherwise with the text tower.
-        Element 1 of the tower output is L2-normalized along the last dimension and returned
-        (presumably the pooled output; confirm against the transformers Siglip output classes).
-        `return_loss` is accepted but not used.
-        """
-        # Fall back to the SigLIP model's config for any flag left unspecified.
-        if output_attentions is None:
-            output_attentions = self.config.output_attentions
-        if output_hidden_states is None:
-            output_hidden_states = self.config.output_hidden_states
-        if return_dict is None:
-            return_dict = self.config.use_return_dict
-        shared_kwargs = {
-            "output_attentions": output_attentions,
-            "output_hidden_states": output_hidden_states,
-            "return_dict": return_dict,
-        }
-        if pixel_values is None:
-            outputs = self.text_model(
-                input_ids=input_ids,
-                attention_mask=attention_mask,
-                position_ids=position_ids,
-                **shared_kwargs,
-            )
-        else:
-            outputs = self.vision_model(
-                pixel_values=pixel_values.to(dtype=self.dtype),
-                interpolate_pos_encoding=interpolate_pos_encoding,
-                **shared_kwargs,
-            )
-        pooled = outputs[1]
-        # normalized features
-        return pooled / pooled.norm(p=2, dim=-1, keepdim=True)
-class ColSigLIP(SiglipModel):
-    """SiglipModel variant producing 128-dim, L2-normalized per-token embeddings for each modality."""
-    def __init__(self, config):
-        super().__init__(config=config)
-        self.dim = 128
-        self.custom_vision_proj = torch.nn.Linear(self.config.vision_config.hidden_size, self.dim)
-        self.custom_text_proj = torch.nn.Linear(self.config.text_config.hidden_size, self.dim)
-        self.main_input_name = "doc_input_ids"
-    def forward(self, *args, **kwargs):
-        """
-        Delegate to `forward_branch` with the same arguments.
-        Args:
-        - input_ids (torch.LongTensor): The input tokens tensor (text inputs).
-        - pixel_values (torch.FloatTensor): The image tensor (image inputs).
-        - attention_mask (torch.LongTensor): The attention mask tensor.
-        Returns:
-        - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim)
-        """
-        return self.forward_branch(*args, **kwargs)
-    def forward_branch(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        pixel_values: Optional[torch.FloatTensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        return_loss: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-        interpolate_pos_encoding: bool = False,
-    ):
-        """
-        Project the last hidden state of the vision tower (when `pixel_values` is given) or of the
-        text tower (otherwise) to `self.dim` and L2-normalize it along the last dimension.
-        Text embeddings are additionally multiplied by `attention_mask`, which zeroes the positions
-        whose mask is 0; `attention_mask` is therefore required on the text path.
-        `return_loss` is accepted but not used.
-        """
-        # Fall back to the SigLIP model's config for any flag left unspecified.
-        if output_attentions is None:
-            output_attentions = self.config.output_attentions
-        if output_hidden_states is None:
-            output_hidden_states = self.config.output_hidden_states
-        if return_dict is None:
-            return_dict = self.config.use_return_dict
-        shared_kwargs = {
-            "output_attentions": output_attentions,
-            "output_hidden_states": output_hidden_states,
-            "return_dict": return_dict,
-        }
-        if pixel_values is not None:
-            vision_out = self.vision_model(
-                pixel_values=pixel_values.to(dtype=self.dtype),
-                interpolate_pos_encoding=interpolate_pos_encoding,
-                **shared_kwargs,
-            )
-            image_proj = self.custom_vision_proj(vision_out.last_hidden_state)
-            # normalize l2 norm
-            return image_proj / image_proj.norm(dim=-1, keepdim=True)
-        text_out = self.text_model(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            **shared_kwargs,
-        )
-        text_proj = self.custom_text_proj(text_out.last_hidden_state)
-        # normalize l2 norm, then zero the positions masked out by the attention mask
-        text_proj = text_proj / text_proj.norm(dim=-1, keepdim=True)
-        return text_proj * attention_mask.unsqueeze(-1)

colpali_engine/models/colbert_architectures.py DELETED Viewed

@@ -1,177 +0,0 @@
-from torch import nn
-from transformers import (
-    BertModel,
-    BertPreTrainedModel,
-    CamembertModel,
-    CamembertPreTrainedModel,
-    LlamaModel,
-    LlamaPreTrainedModel,
-    XLMRobertaModel,
-    XLMRobertaPreTrainedModel,
-)
-class ColCamembert(CamembertPreTrainedModel):
-    """Camembert encoder followed by a linear projection to 128-dim per-token embeddings."""
-    def __init__(self, config):
-        super().__init__(config=config)
-        self.roberta: CamembertPreTrainedModel = CamembertModel(config)
-        self.dim = 128
-        self.linear = nn.Linear(self.roberta.config.hidden_size, self.dim)
-        self.main_input_name = "doc_input_ids"
-    def forward(self, *args, **kwargs):
-        """
-        Forward pass through Camembert and the linear layer for dimensionality reduction.
-        `attention_mask` must be passed as a keyword argument because it is read from kwargs.
-        Args:
-        - input_ids (torch.LongTensor): The input tokens tensor.
-        - attention_mask (torch.LongTensor): The attention mask tensor.
-        Returns:
-        - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim)
-        """
-        hidden = self.roberta(*args, **kwargs)[0]  # (batch_size, sequence_length, hidden_size)
-        token_embeddings = self.linear(hidden)
-        # L2-normalize each token embedding.
-        token_embeddings = token_embeddings / token_embeddings.norm(dim=-1, keepdim=True)
-        # Zero the positions whose attention mask is 0.
-        mask = kwargs["attention_mask"].unsqueeze(-1)
-        return token_embeddings * mask
-class ColXLMRoBERTa(XLMRobertaPreTrainedModel):
-    """XLM-RoBERTa encoder followed by a linear projection to 128-dim per-token embeddings."""
-    def __init__(self, config):
-        super().__init__(config=config)
-        self.roberta: XLMRobertaPreTrainedModel = XLMRobertaModel(config)
-        self.dim = 128
-        self.linear = nn.Linear(self.roberta.config.hidden_size, self.dim)
-        self.main_input_name = "doc_input_ids"
-    def forward(self, *args, **kwargs):
-        """
-        Forward pass through XLM-RoBERTa and the linear layer for dimensionality reduction.
-        `attention_mask` must be passed as a keyword argument because it is read from kwargs.
-        Args:
-        - input_ids (torch.LongTensor): The input tokens tensor.
-        - attention_mask (torch.LongTensor): The attention mask tensor.
-        Returns:
-        - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim)
-        """
-        hidden = self.roberta(*args, **kwargs)[0]  # (batch_size, sequence_length, hidden_size)
-        token_embeddings = self.linear(hidden)
-        # L2-normalize each token embedding.
-        token_embeddings = token_embeddings / token_embeddings.norm(dim=-1, keepdim=True)
-        # Zero the positions whose attention mask is 0.
-        mask = kwargs["attention_mask"].unsqueeze(-1)
-        return token_embeddings * mask
-class BiXLMRoBERTa(XLMRobertaPreTrainedModel):
-    """XLM-RoBERTa encoder pooled into a single L2-normalized vector per sequence."""
-    def __init__(self, config):
-        super().__init__(config=config)
-        self.roberta: XLMRobertaPreTrainedModel = XLMRobertaModel(config)
-        self.main_input_name = "doc_input_ids"
-    def forward(self, *args, **kwargs):
-        """
-        Forward pass through XLM-RoBERTa followed by masked mean pooling over the tokens.
-        `attention_mask` must be passed as a keyword argument because it is read from kwargs.
-        Args:
-        - input_ids (torch.LongTensor): The input tokens tensor.
-        - attention_mask (torch.LongTensor): The attention mask tensor.
-        Returns:
-        - torch.Tensor: Embeddings of shape (batch_size, hidden_size)
-        """
-        hidden = self.roberta(*args, **kwargs)[0]  # (batch_size, sequence_length, hidden_size)
-        # Mean over the tokens whose attention mask is 1.
-        masked_sum = (hidden * kwargs["attention_mask"].unsqueeze(-1)).sum(dim=1)
-        pooled = masked_sum / kwargs["attention_mask"].sum(dim=1, keepdim=True)
-        # L2-normalize the pooled vector.
-        return pooled / pooled.norm(dim=-1, keepdim=True)
-class ColBERT(BertPreTrainedModel):
-    """BERT encoder followed by a linear projection to 128-dim per-token embeddings."""
-    def __init__(self, config):
-        super().__init__(config=config)
-        self.bert: BertModel = BertModel(config)
-        self.dim = 128
-        self.linear = nn.Linear(self.bert.config.hidden_size, self.dim)
-        self.main_input_name = "doc_input_ids"
-    def forward(self, *args, **kwargs):
-        """
-        Forward pass through BERT and the linear layer for dimensionality reduction.
-        `attention_mask` must be passed as a keyword argument because it is read from kwargs.
-        Args:
-        - input_ids (torch.LongTensor): The input tokens tensor.
-        - attention_mask (torch.LongTensor): The attention mask tensor.
-        Returns:
-        - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim)
-        """
-        hidden = self.bert(*args, **kwargs)[0]  # (batch_size, sequence_length, hidden_size)
-        token_embeddings = self.linear(hidden)
-        # L2-normalize each token embedding.
-        token_embeddings = token_embeddings / token_embeddings.norm(dim=-1, keepdim=True)
-        # Zero the positions whose attention mask is 0.
-        mask = kwargs["attention_mask"].unsqueeze(-1)
-        return token_embeddings * mask
-class BiBERT(BertPreTrainedModel):
-    """BERT encoder pooled into a single L2-normalized vector per sequence."""
-    def __init__(self, config):
-        super().__init__(config=config)
-        self.bert: BertModel = BertModel(config)
-        self.main_input_name = "doc_input_ids"
-    def forward(self, *args, **kwargs):
-        """
-        Forward pass through BERT followed by masked mean pooling over the tokens.
-        `attention_mask` must be passed as a keyword argument because it is read from kwargs.
-        Args:
-        - input_ids (torch.LongTensor): The input tokens tensor.
-        - attention_mask (torch.LongTensor): The attention mask tensor.
-        Returns:
-        - torch.Tensor: Embeddings of shape (batch_size, hidden_size)
-        """
-        hidden = self.bert(*args, **kwargs)[0]  # (batch_size, sequence_length, hidden_size)
-        # Mean over the tokens whose attention mask is 1.
-        masked_sum = (hidden * kwargs["attention_mask"].unsqueeze(-1)).sum(dim=1)
-        pooled = masked_sum / kwargs["attention_mask"].sum(dim=1, keepdim=True)
-        # L2-normalize the pooled vector.
-        return pooled / pooled.norm(dim=-1, keepdim=True)
-class ColLlama(LlamaPreTrainedModel):
-    """Llama backbone followed by a linear projection to 128-dim per-token embeddings."""
-    def __init__(self, config):
-        super().__init__(config=config)
-        self.model: LlamaModel = LlamaModel(config)
-        self.dim = 128
-        self.linear = nn.Linear(self.model.config.hidden_size, self.dim)
-        self.main_input_name = "doc_input_ids"
-    def forward(self, *args, **kwargs):
-        """
-        Forward pass through Llama and the linear layer for dimensionality reduction.
-        `attention_mask` must be passed as a keyword argument because it is read from kwargs.
-        Args:
-        - input_ids (torch.LongTensor): The input tokens tensor.
-        - attention_mask (torch.LongTensor): The attention mask tensor.
-        Returns:
-        - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim)
-        """
-        hidden = self.model(*args, **kwargs)[0]  # (batch_size, sequence_length, hidden_size)
-        token_embeddings = self.linear(hidden)
-        # L2-normalize each token embedding.
-        token_embeddings = token_embeddings / token_embeddings.norm(dim=-1, keepdim=True)
-        # Zero the positions whose attention mask is 0.
-        mask = kwargs["attention_mask"].unsqueeze(-1)
-        return token_embeddings * mask

colpali_engine/models/idefics_colbert_architecture.py DELETED Viewed

@@ -1,57 +0,0 @@
-from torch import nn
-from transformers import Idefics2Model, Idefics2PreTrainedModel
-class BiIdefics(Idefics2PreTrainedModel):
-    """Idefics2 backbone reduced to one L2-normalized vector per sequence (last-token pooling)."""
-    def __init__(self, config):
-        super().__init__(config=config)
-        self.model: Idefics2Model = Idefics2Model(config)
-        self.pooling_strategy = "last"
-        self.main_input_name = "doc_input_ids"
-    def forward(self, *args, **kwargs):
-        """
-        Forward pass through Idefics2, keeping only the hidden state at the last sequence position.
-        Args:
-        - input_ids (torch.LongTensor): The input tokens tensor.
-        - attention_mask (torch.LongTensor): The attention mask tensor.
-        Returns:
-        - torch.Tensor: Embeddings of shape (batch_size, hidden_size)
-        """
-        hidden = self.model(*args, **kwargs)[0]  # (batch_size, sequence_length, hidden_size)
-        # pooling - last token
-        last_token = hidden[:, -1, :]
-        # L2-normalize the pooled vector.
-        return last_token / last_token.norm(dim=-1, keepdim=True)
-class ColIdefics(Idefics2PreTrainedModel):
-    """Idefics2 backbone followed by a linear projection to 128-dim per-token embeddings."""
-    def __init__(self, config):
-        super().__init__(config=config)
-        self.model: Idefics2Model = Idefics2Model(config)
-        self.dim = 128
-        self.linear = nn.Linear(self.model.config.text_config.hidden_size, self.dim)
-        self.main_input_name = "doc_input_ids"
-    def forward(self, *args, **kwargs):
-        """
-        Forward pass through Idefics2 and the linear layer for dimensionality reduction.
-        `attention_mask` must be passed as a keyword argument because it is read from kwargs.
-        Args:
-        - input_ids (torch.LongTensor): The input tokens tensor.
-        - attention_mask (torch.LongTensor): The attention mask tensor.
-        Returns:
-        - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim)
-        """
-        hidden = self.model(*args, **kwargs)[0]  # (batch_size, sequence_length, hidden_size)
-        token_embeddings = self.linear(hidden)
-        # L2-normalize each token embedding.
-        token_embeddings = token_embeddings / token_embeddings.norm(dim=-1, keepdim=True)
-        # Zero the positions whose attention mask is 0.
-        mask = kwargs["attention_mask"].unsqueeze(-1)
-        return token_embeddings * mask

colpali_engine/models/paligemma_colbert_architecture.py DELETED Viewed

@@ -1,191 +0,0 @@
-import torch
-from torch import nn
-from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration, PaliGemmaPreTrainedModel
-class BiPaliLast(PaliGemmaPreTrainedModel):
-    """PaliGemma backbone reduced to one L2-normalized vector per sequence (last-token pooling)."""
-    def __init__(self, config):
-        super().__init__(config=config)
-        self.model: PaliGemmaForConditionalGeneration = PaliGemmaForConditionalGeneration(config)
-        self.pooling_strategy = "last"
-        self.main_input_name = "doc_input_ids"
-    def forward(self, *args, **kwargs):
-        """
-        Forward pass through PaliGemma, keeping only the final-layer hidden state at the last position.
-        Args:
-        - input_ids (torch.LongTensor): The input tokens tensor.
-        - attention_mask (torch.LongTensor): The attention mask tensor.
-        Returns:
-        - torch.Tensor: Embeddings of shape (batch_size, hidden_size)
-        """
-        backbone_out = self.model(*args, output_hidden_states=True, **kwargs)
-        hidden = backbone_out.hidden_states[-1]  # (batch_size, sequence_length, hidden_size)
-        # pooling - last token
-        last_token = hidden[:, -1, :]
-        # L2-normalize the pooled vector.
-        return last_token / last_token.norm(dim=-1, keepdim=True)
-class BiPaliMean(PaliGemmaPreTrainedModel):
-    """PaliGemma backbone reduced to one L2-normalized vector per sequence (masked mean pooling)."""
-    def __init__(self, config):
-        super().__init__(config=config)
-        self.model: PaliGemmaForConditionalGeneration = PaliGemmaForConditionalGeneration(config)
-        self.pooling_strategy = "mean"
-        self.main_input_name = "doc_input_ids"
-    def forward(self, *args, **kwargs):
-        """
-        Forward pass through PaliGemma followed by masked mean pooling of the final-layer hidden states.
-        `attention_mask` must be passed as a keyword argument because it is read from kwargs.
-        Args:
-        - input_ids (torch.LongTensor): The input tokens tensor.
-        - attention_mask (torch.LongTensor): The attention mask tensor.
-        Returns:
-        - torch.Tensor: Embeddings of shape (batch_size, hidden_size)
-        """
-        backbone_out = self.model(*args, output_hidden_states=True, **kwargs)
-        hidden = backbone_out.hidden_states[-1]  # (batch_size, sequence_length, hidden_size)
-        # Mean over the tokens whose attention mask is 1.
-        masked_sum = torch.sum(hidden * kwargs["attention_mask"].unsqueeze(-1), dim=1)
-        token_count = torch.sum(kwargs["attention_mask"], dim=1, keepdim=True)
-        pooled = masked_sum / token_count
-        # L2-normalize the pooled vector.
-        return pooled / pooled.norm(dim=-1, keepdim=True)
-class ColPali(PaliGemmaPreTrainedModel):
-    """PaliGemma backbone followed by a linear projection to 128-dim per-token embeddings."""
-    def __init__(self, config):
-        super().__init__(config=config)
-        self.model: PaliGemmaForConditionalGeneration = PaliGemmaForConditionalGeneration(config)
-        self.dim = 128
-        self.custom_text_proj = nn.Linear(self.model.config.text_config.hidden_size, self.dim)
-        self.main_input_name = "doc_input_ids"
-    def forward(self, *args, **kwargs):
-        """
-        Forward pass through PaliGemma and the linear layer for dimensionality reduction.
-        `attention_mask` must be passed as a keyword argument because it is read from kwargs.
-        Args:
-        - input_ids (torch.LongTensor): The input tokens tensor.
-        - attention_mask (torch.LongTensor): The attention mask tensor.
-        Returns:
-        - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim)
-        """
-        backbone_out = self.model(*args, output_hidden_states=True, **kwargs)
-        hidden = backbone_out.hidden_states[-1]  # (batch_size, sequence_length, hidden_size)
-        token_embeddings = self.custom_text_proj(hidden)
-        # L2-normalize each token embedding.
-        token_embeddings = token_embeddings / token_embeddings.norm(dim=-1, keepdim=True)
-        # Zero the positions whose attention mask is 0.
-        mask = kwargs["attention_mask"].unsqueeze(-1)
-        return token_embeddings * mask
-class ColNewSiglip(PaliGemmaPreTrainedModel):
-    """
-    PaliGemma-based multi-vector model with separate 128-dim projections: image inputs go through
-    the vision tower and multimodal projector only, text inputs go through the full model.
-    """
-    def __init__(self, config):
-        super().__init__(config=config)
-        self.model: PaliGemmaForConditionalGeneration = PaliGemmaForConditionalGeneration(config)
-        self.dim = 128
-        self.custom_image_proj = nn.Linear(self.model.config.vision_config.projection_dim, self.dim)
-        self.custom_text_proj = nn.Linear(self.model.config.text_config.hidden_size, self.dim)
-        self.main_input_name = "doc_input_ids"
-    def forward(self, *args, **kwargs):
-        """
-        Embed a batch as L2-normalized per-token vectors of size `self.dim`.
-        The image path is taken when `pixel_values` is present in kwargs; otherwise the text path
-        is taken, which reads `attention_mask` from kwargs and zeroes the positions whose mask is 0.
-        Args:
-        - input_ids (torch.LongTensor): The input tokens tensor.
-        - attention_mask (torch.LongTensor): The attention mask tensor.
-        - pixel_values (torch.FloatTensor, optional): The image tensor.
-        Returns:
-        - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim)
-        """
-        if "pixel_values" in kwargs:
-            image_features = self.vision_model_output(*args, **kwargs)
-            image_proj = self.custom_image_proj(image_features)
-            return image_proj / image_proj.norm(dim=-1, keepdim=True)
-        backbone_out = self.model(*args, output_hidden_states=True, **kwargs)
-        hidden = backbone_out.hidden_states[-1]  # (batch_size, sequence_length, hidden_size)
-        text_proj = self.custom_text_proj(hidden)
-        # normalize l2 norm, then zero the positions masked out by the attention mask
-        text_proj = text_proj / text_proj.norm(dim=-1, keepdim=True)
-        return text_proj * kwargs["attention_mask"].unsqueeze(-1)
-    def vision_model_output(self, input_ids: torch.LongTensor = None, pixel_values: torch.FloatTensor = None, **kwargs):
-        """
-        Return the multimodal-projected last hidden state of the vision tower for `pixel_values`.
-        `input_ids` is embedded only to obtain the dtype that `pixel_values` is cast to.
-        Raises:
-        - ValueError: If `pixel_values` is None or `input_ids` has a single position.
-        """
-        inputs_embeds = self.model.get_input_embeddings()(input_ids)
-        if pixel_values is None or input_ids.shape[1] == 1:
-            raise ValueError("pixel_values is None or input_ids.shape[1] == 1")
-        vision_out = self.model.vision_tower(pixel_values.to(inputs_embeds.dtype))
-        return self.model.multi_modal_projector(vision_out.last_hidden_state)
-class BiNewSiglip(PaliGemmaPreTrainedModel):
-    """
-    PaliGemma-based single-vector model: image inputs are mean-pooled vision features,
-    text inputs are masked-mean-pooled final-layer hidden states; both are L2-normalized.
-    """
-    def __init__(self, config):
-        super().__init__(config=config)
-        self.model: PaliGemmaForConditionalGeneration = PaliGemmaForConditionalGeneration(config)
-        self.main_input_name = "doc_input_ids"
-    def forward(self, *args, **kwargs):
-        """
-        Embed a batch as one L2-normalized vector per item.
-        The image path is taken when `pixel_values` is present in kwargs; otherwise the text path
-        is taken, which reads `attention_mask` from kwargs for the masked mean.
-        Args:
-        - input_ids (torch.LongTensor): The input tokens tensor.
-        - attention_mask (torch.LongTensor): The attention mask tensor.
-        - pixel_values (torch.FloatTensor, optional): The image tensor.
-        Returns:
-        - torch.Tensor: Embeddings with the token dimension pooled away, i.e. (batch_size, feature_dim)
-        """
-        if "pixel_values" in kwargs:
-            image_features = self.vision_model_output(*args, **kwargs)
-            # pool image features
-            pooled = torch.mean(image_features, dim=1)
-        else:
-            backbone_out = self.model(*args, output_hidden_states=True, **kwargs)
-            hidden = backbone_out.hidden_states[-1]  # (batch_size, sequence_length, hidden_size)
-            # pooling - mean on attention mask == 1
-            masked_sum = torch.sum(hidden * kwargs["attention_mask"].unsqueeze(-1), dim=1)
-            token_count = torch.sum(kwargs["attention_mask"], dim=1, keepdim=True)
-            pooled = masked_sum / token_count
-        # L2-normalize the pooled vector.
-        return pooled / pooled.norm(dim=-1, keepdim=True)
-    def vision_model_output(self, input_ids: torch.LongTensor = None, pixel_values: torch.FloatTensor = None, **kwargs):
-        """
-        Return the multimodal-projected last hidden state of the vision tower for `pixel_values`.
-        `input_ids` is embedded only to obtain the dtype that `pixel_values` is cast to.
-        Raises:
-        - ValueError: If `pixel_values` is None or `input_ids` has a single position.
-        """
-        inputs_embeds = self.model.get_input_embeddings()(input_ids)
-        if pixel_values is None or input_ids.shape[1] == 1:
-            raise ValueError("pixel_values is None or input_ids.shape[1] == 1")
-        vision_out = self.model.vision_tower(pixel_values.to(inputs_embeds.dtype))
-        return self.model.multi_modal_projector(vision_out.last_hidden_state)

colpali_engine/trainer/__init__.py DELETED Viewed

File without changes

colpali_engine/trainer/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (171 Bytes)

colpali_engine/trainer/__pycache__/retrieval_evaluator.cpython-310.pyc DELETED Viewed

Binary file (3.18 kB)

colpali_engine/trainer/contrastive_trainer.py DELETED Viewed

@@ -1,64 +0,0 @@
-import torch
-from transformers import Trainer
-class ContrastiveTrainer(Trainer):
-    def __init__(self, loss_func, is_vision_model, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.loss_func = loss_func
-        self.is_vision_model = is_vision_model
-    def compute_loss(self, model, inputs, return_outputs=False):
-        query_outputs = model(input_ids=inputs["query_input_ids"], attention_mask=inputs["query_attention_mask"])
-        if self.is_vision_model:
-            if "doc_pixel_attention_mask" not in inputs:
-                doc_outputs = model(
-                    input_ids=inputs["doc_input_ids"],
-                    attention_mask=inputs["doc_attention_mask"],
-                    pixel_values=inputs["doc_pixel_values"],
-                )
-            else:
-                doc_outputs = model(
-                    input_ids=inputs["doc_input_ids"],
-                    attention_mask=inputs["doc_attention_mask"],
-                    pixel_values=inputs["doc_pixel_values"],
-                    pixel_attention_mask=inputs["doc_pixel_attention_mask"],
-                )
-        else:
-            doc_outputs = model(input_ids=inputs["doc_input_ids"], attention_mask=inputs["doc_attention_mask"])
-        loss = self.loss_func(query_outputs, doc_outputs)
-        return (loss, (query_outputs, doc_outputs)) if return_outputs else loss
-    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=True):
-        """This function is used to generate predictions and return the loss for the given inputs."""
-        if not prediction_loss_only:
-            raise ValueError("prediction_step is only called with prediction_loss_only=True")
-        with torch.no_grad():
-            if self.is_vision_model:
-                if "doc_pixel_attention_mask" not in inputs:
-                    doc_outputs = model(
-                        input_ids=inputs["doc_input_ids"],
-                        attention_mask=inputs["doc_attention_mask"],
-                        pixel_values=inputs["doc_pixel_values"],
-                    )
-                else:
-                    doc_outputs = model(
-                        input_ids=inputs["doc_input_ids"],
-                        attention_mask=inputs["doc_attention_mask"],
-                        pixel_values=inputs["doc_pixel_values"],
-                        pixel_attention_mask=inputs["doc_pixel_attention_mask"],
-                    )
-                query_outputs = model(
-                    input_ids=inputs["query_input_ids"], attention_mask=inputs["query_attention_mask"]
-                )
-            else:
-                query_outputs = model(
-                    input_ids=inputs["query_input_ids"], attention_mask=inputs["query_attention_mask"]
-                )
-                doc_outputs = model(input_ids=inputs["doc_input_ids"], attention_mask=inputs["doc_attention_mask"])
-            loss = self.loss_func(query_outputs, doc_outputs)
-            return loss, None, None

colpali_engine/trainer/retrieval_evaluator.py DELETED Viewed

@@ -1,72 +0,0 @@
-import torch
-from mteb.evaluation.evaluators import RetrievalEvaluator
-class CustomEvaluator:
-    def __init__(self, is_multi_vector=False):
-        self.is_multi_vector = is_multi_vector
-        self.mteb_evaluator = RetrievalEvaluator()
-    def evaluate(self, qs, ps):
-        if self.is_multi_vector:
-            scores = self.evaluate_colbert(qs, ps)
-        else:
-            scores = self.evaluate_biencoder(qs, ps)
-        assert scores.shape[0] == len(qs)
-        arg_score = scores.argmax(dim=1)
-        # compare to arange
-        accuracy = (arg_score == torch.arange(scores.shape[0], device=scores.device)).sum().item() / scores.shape[0]
-        print(arg_score)
-        print(f"Top 1 Accuracy (verif): {accuracy}")
-        # cast to numpy
-        # scores = scores.cpu().numpy()
-        scores = scores.to(torch.float32).cpu().numpy()
-        return scores
-    def compute_metrics(self, relevant_docs, results, **kwargs):
-        # wrap mteb package
-        ndcg, _map, recall, precision, naucs = self.mteb_evaluator.evaluate(
-            relevant_docs,
-            results,
-            self.mteb_evaluator.k_values,
-            ignore_identical_ids=kwargs.get("ignore_identical_ids", True),
-        )
-        mrr = self.mteb_evaluator.evaluate_custom(relevant_docs, results, self.mteb_evaluator.k_values, "mrr")
-        scores = {
-            **{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()},
-            **{f"map_at_{k.split('@')[1]}": v for (k, v) in _map.items()},
-            **{f"recall_at_{k.split('@')[1]}": v for (k, v) in recall.items()},
-            **{f"precision_at_{k.split('@')[1]}": v for (k, v) in precision.items()},
-            **{f"mrr_at_{k.split('@')[1]}": v for (k, v) in mrr[0].items()},
-            **{f"naucs_at_{k.split('@')[1]}": v for (k, v) in naucs.items()},
-        }
-        return scores
-    def evaluate_colbert(self, qs, ps, batch_size=128) -> torch.Tensor:
-        scores = []
-        for i in range(0, len(qs), batch_size):
-            scores_batch = []
-            qs_batch = torch.nn.utils.rnn.pad_sequence(qs[i : i + batch_size], batch_first=True, padding_value=0).to(
-                "cpu"
-            )
-            for j in range(0, len(ps), batch_size):
-                ps_batch = torch.nn.utils.rnn.pad_sequence(
-                    ps[j : j + batch_size], batch_first=True, padding_value=0
-                ).to("cpu")
-                scores_batch.append(torch.einsum("bnd,csd->bcns", qs_batch, ps_batch).max(dim=3)[0].sum(dim=2))
-            scores_batch = torch.cat(scores_batch, dim=1).cpu()
-            scores.append(scores_batch)
-        scores = torch.cat(scores, dim=0)
-        return scores
-    def evaluate_biencoder(self, qs, ps) -> torch.Tensor:
-        qs = torch.stack(qs)
-        ps = torch.stack(ps)
-        scores = torch.einsum("bd,cd->bc", qs, ps)
-        return scores

colpali_engine/utils/__init__.py DELETED Viewed

File without changes

colpali_engine/utils/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (169 Bytes)

colpali_engine/utils/__pycache__/colpali_processing_utils.cpython-310.pyc DELETED Viewed

Binary file (1.2 kB)

colpali_engine/utils/__pycache__/image_from_page_utils.cpython-310.pyc DELETED Viewed

Binary file (998 Bytes)

colpali_engine/utils/colidefics_processing_utils.py DELETED Viewed

@@ -1,53 +0,0 @@
-# Utils for processing images and queries for ColPaLi
-def process_images(processor, images, max_length: int = 50):
-    texts_doc = []
-    images = [image.convert("RGB") for image in images]
-    for _ in images:
-        messages_doc = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": "Describe the image."},
-                    {"type": "image"},
-                ],
-            },
-        ]
-        text_doc = processor.apply_chat_template(messages_doc, add_generation_prompt=False)
-        texts_doc.append(text_doc.strip())
-    batch_doc = processor(
-        text=texts_doc,
-        images=images,
-        return_tensors="pt",
-        padding="longest",
-    )
-    return batch_doc
-def process_queries(processor, queries, mock_image, max_length: int = 50):
-    texts_query = []
-    for query in queries:
-        messages_query = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": f"Question: {query}<end_of_utterance><end_of_utterance><end_of_utterance><end_of_utterance><end_of_utterance>",
-                    },
-                ],
-            },
-        ]
-        text_query = processor.apply_chat_template(messages_query, add_generation_prompt=False).strip()
-        texts_query.append(text_query)
-    batch_query = processor(
-        text=texts_query,
-        return_tensors="pt",
-        padding="longest",
-        max_length=max_length,
-    )
-    return batch_query

colpali_engine/utils/colpali_processing_utils.py DELETED Viewed

@@ -1,36 +0,0 @@
-# Utils for processing images and queries for ColPaLi
-def process_images(processor, images, max_length: int = 50):
-    texts_doc = ["Describe the image."] * len(images)
-    images = [image.convert("RGB") for image in images]
-    batch_doc = processor(
-        text=texts_doc,
-        images=images,
-        return_tensors="pt",
-        padding="longest",
-        max_length=max_length + processor.image_seq_length,
-    )
-    return batch_doc
-def process_queries(processor, queries, mock_image, max_length: int = 50):
-    texts_query = []
-    for query in queries:
-        query = f"Question: {query}<unused0><unused0><unused0><unused0><unused0>"
-        texts_query.append(query)
-    batch_query = processor(
-        images=[mock_image.convert("RGB")] * len(texts_query),
-        # NOTE: the image is not used in batch_query but it is required for calling the processor
-        text=texts_query,
-        return_tensors="pt",
-        padding="longest",
-        max_length=max_length + processor.image_seq_length,
-    )
-    del batch_query["pixel_values"]
-    batch_query["input_ids"] = batch_query["input_ids"][..., processor.image_seq_length :]
-    batch_query["attention_mask"] = batch_query["attention_mask"][..., processor.image_seq_length :]
-    return batch_query

colpali_engine/utils/dataset_transformation.py DELETED Viewed

@@ -1,158 +0,0 @@
-import os
-from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset
-# True when the USE_LOCAL_DATASET environment variable is "1" or unset (the default is "1").
-USE_LOCAL_DATASET = os.environ.get("USE_LOCAL_DATASET", "1") == "1"
-def add_metadata_column(dataset, column_name, value):
-    def add_source(example):
-        example[column_name] = value
-        return example
-    return dataset.map(add_source)
-def load_train_set() -> DatasetDict:
-    ds_paths = [
-        "infovqa_train",
-        "docvqa_train",
-        "arxivqa_train",
-        "tatdqa_train",
-        "syntheticDocQA_government_reports_train",
-        "syntheticDocQA_healthcare_industry_train",
-        "syntheticDocQA_artificial_intelligence_train",
-        "syntheticDocQA_energy_train",
-    ]
-    base_path = "./data_dir/" if USE_LOCAL_DATASET else "vidore/"
-    ds_tot = []
-    for path in ds_paths:
-        cpath = base_path + path
-        ds = load_dataset(cpath, split="train")
-        if "arxivqa" in path:
-            # subsample 10k
-            ds = ds.shuffle(42).select(range(10000))
-        ds_tot.append(ds)
-    dataset = concatenate_datasets(ds_tot)
-    dataset = dataset.shuffle(seed=42)
-    # split into train and test
-    dataset_eval = dataset.select(range(500))
-    dataset = dataset.select(range(500, len(dataset)))
-    ds_dict = DatasetDict({"train": dataset, "test": dataset_eval})
-    return ds_dict
-def load_train_set_with_tabfquad() -> DatasetDict:
-    ds_paths = [
-        "infovqa_train",
-        "docvqa_train",
-        "arxivqa_train",
-        "tatdqa_train",
-        "tabfquad_train_subsampled",
-        "syntheticDocQA_government_reports_train",
-        "syntheticDocQA_healthcare_industry_train",
-        "syntheticDocQA_artificial_intelligence_train",
-        "syntheticDocQA_energy_train",
-    ]
-    base_path = "./data_dir/" if USE_LOCAL_DATASET else "vidore/"
-    ds_tot = []
-    for path in ds_paths:
-        cpath = base_path + path
-        ds = load_dataset(cpath, split="train")
-        if "arxivqa" in path:
-            # subsample 10k
-            ds = ds.shuffle(42).select(range(10000))
-        ds_tot.append(ds)
-    dataset = concatenate_datasets(ds_tot)
-    dataset = dataset.shuffle(seed=42)
-    # split into train and test
-    dataset_eval = dataset.select(range(500))
-    dataset = dataset.select(range(500, len(dataset)))
-    ds_dict = DatasetDict({"train": dataset, "test": dataset_eval})
-    return ds_dict
-def load_train_set_with_docmatix() -> DatasetDict:
-    ds_paths = [
-        "infovqa_train",
-        "docvqa_train",
-        "arxivqa_train",
-        "tatdqa_train",
-        "tabfquad_train_subsampled",
-        "syntheticDocQA_government_reports_train",
-        "syntheticDocQA_healthcare_industry_train",
-        "syntheticDocQA_artificial_intelligence_train",
-        "syntheticDocQA_energy_train",
-        "Docmatix_filtered_train",
-    ]
-    base_path = "./data_dir/" if USE_LOCAL_DATASET else "vidore/"
-    ds_tot = []
-    for path in ds_paths:
-        cpath = base_path + path
-        ds = load_dataset(cpath, split="train")
-        if "arxivqa" in path:
-            # subsample 10k
-            ds = ds.shuffle(42).select(range(10000))
-        ds_tot.append(ds)
-    dataset = concatenate_datasets(ds_tot)
-    dataset = dataset.shuffle(seed=42)
-    # split into train and test
-    dataset_eval = dataset.select(range(500))
-    dataset = dataset.select(range(500, len(dataset)))
-    ds_dict = DatasetDict({"train": dataset, "test": dataset_eval})
-    return ds_dict
-def load_docvqa_dataset() -> DatasetDict:
-    if USE_LOCAL_DATASET:
-        dataset_doc = load_dataset("./data_dir/DocVQA", "DocVQA", split="validation")
-        dataset_doc_eval = load_dataset("./data_dir/DocVQA", "DocVQA", split="test")
-        dataset_info = load_dataset("./data_dir/DocVQA", "InfographicVQA", split="validation")
-        dataset_info_eval = load_dataset("./data_dir/DocVQA", "InfographicVQA", split="test")
-    else:
-        dataset_doc = load_dataset("lmms-lab/DocVQA", "DocVQA", split="validation")
-        dataset_doc_eval = load_dataset("lmms-lab/DocVQA", "DocVQA", split="test")
-        dataset_info = load_dataset("lmms-lab/DocVQA", "InfographicVQA", split="validation")
-        dataset_info_eval = load_dataset("lmms-lab/DocVQA", "InfographicVQA", split="test")
-    # concatenate the two datasets
-    dataset = concatenate_datasets([dataset_doc, dataset_info])
-    dataset_eval = concatenate_datasets([dataset_doc_eval, dataset_info_eval])
-    # sample 100 from eval dataset
-    dataset_eval = dataset_eval.shuffle(seed=42).select(range(200))
-    # rename question as query
-    dataset = dataset.rename_column("question", "query")
-    dataset_eval = dataset_eval.rename_column("question", "query")
-    # create new column image_filename that corresponds to ucsf_document_id if not None, else image_url
-    dataset = dataset.map(
-        lambda x: {"image_filename": x["ucsf_document_id"] if x["ucsf_document_id"] is not None else x["image_url"]}
-    )
-    dataset_eval = dataset_eval.map(
-        lambda x: {"image_filename": x["ucsf_document_id"] if x["ucsf_document_id"] is not None else x["image_url"]}
-    )
-    ds_dict = DatasetDict({"train": dataset, "test": dataset_eval})
-    return ds_dict
-class TestSetFactory:
-    def __init__(self, dataset_path):
-        self.dataset_path = dataset_path
-    def __call__(self, *args, **kwargs):
-        dataset = load_dataset(self.dataset_path, split="test")
-        return dataset
-# Manual smoke test: load one test set through TestSetFactory and print it.
-if __name__ == "__main__":
-    ds = TestSetFactory("vidore/tabfquad_test_subsampled")()
-    print(ds)

colpali_engine/utils/gpu_stats.py DELETED Viewed

@@ -1,24 +0,0 @@
-# cond import
-try:
-    from pynvml import *
-    def print_gpu_utilization():
-        nvmlInit()
-        handle = nvmlDeviceGetHandleByIndex(0)
-        info = nvmlDeviceGetMemoryInfo(handle)
-        print(f"GPU memory occupied: {info.used // 1024 ** 2} MB.")
-    def print_summary(result):
-        print(f"Time: {result.metrics['train_runtime']:.2f}")
-        print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
-        print_gpu_utilization()
-except ImportError:
-    print("pynvml not found. GPU stats will not be printed.")
-    def print_summary(result):
-        print(f"Time: {result.metrics['train_runtime']:.2f}")
-        print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
-    def print_gpu_utilization():
-        pass

colpali_engine/utils/image_from_page_utils.py DELETED Viewed

@@ -1,21 +0,0 @@
-import requests
-from PIL import Image
-def load_from_pdf(pdf_path: str):
-    from pdf2image import convert_from_path
-    images = convert_from_path(pdf_path)
-    return images
-def load_from_image_urls(urls: str):
-    images = [Image.open(requests.get(url, stream=True).raw) for url in urls]
-    return images
-def load_from_dataset(dataset):
-    from datasets import load_dataset
-    dataset = load_dataset(dataset, split="test")
-    return dataset["image"]

colpali_engine/utils/image_utils.py DELETED Viewed

@@ -1,64 +0,0 @@
-"""
-Utility functions for working with images.
-"""
-import base64
-import io
-from PIL import Image
-def scale_image(image: Image.Image, new_height: int = 1024) -> Image.Image:
-    """
-    Scale an image to a new height while maintaining the aspect ratio.
-    """
-    # Calculate the scaling factor
-    width, height = image.size
-    aspect_ratio = width / height
-    new_width = int(new_height * aspect_ratio)
-    # Resize the image
-    scaled_image = image.resize((new_width, new_height))
-    return scaled_image
-def scale_to_max_dimension(image: Image.Image, max_dimension: int = 1024) -> Image.Image:
-    """
-    Scale an image to a maximum dimension while maintaining the aspect ratio.
-    """
-    # Get the dimensions of the image
-    width, height = image.size
-    max_original_dimension = max(width, height)
-    if max_original_dimension < max_dimension:
-        return image
-    # Calculate the scaling factor
-    aspect_ratio = max_dimension / max_original_dimension
-    new_width = int(width * aspect_ratio)
-    new_height = int(height * aspect_ratio)
-    # Resize the image
-    scaled_image = image.resize((new_width, new_height))
-    return scaled_image
-def get_base64_image(img: str | Image.Image, add_url_prefix: bool = True) -> str:
-    """
-    Convert an image (from a filepath or a PIL.Image object) to a JPEG-base64 string.
-    """
-    if isinstance(img, str):
-        img = Image.open(img)
-    elif isinstance(img, Image.Image):
-        pass
-    else:
-        raise ValueError("`img` must be a path to an image or a PIL Image object.")
-    buffered = io.BytesIO()
-    img.save(buffered, format="jpeg")
-    b64_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
-    return f"data:image/jpeg;base64,{b64_data}" if add_url_prefix else b64_data

colpali_engine/utils/iter_utils.py DELETED Viewed

@@ -1,42 +0,0 @@
-import sys
-def islice(iterable, *args):
-    """
-    Yield a slice of an iterable.
-    >>> islice('ABCDEFG', 2) → A B
-    >>> islice('ABCDEFG', 2, 4) → C D
-    >>> islice('ABCDEFG', 2, None) → C D E F G
-    >>> islice('ABCDEFG', 0, None, 2) → A C E G
-    """
-    s = slice(*args)
-    start, stop, step = s.start or 0, s.stop or sys.maxsize, s.step or 1
-    it = iter(range(start, stop, step))
-    try:
-        nexti = next(it)
-    except StopIteration:
-        # Consume *iterable* up to the *start* position.
-        for i, element in zip(range(start), iterable):
-            pass
-        return
-    try:
-        for i, element in enumerate(iterable):
-            if i == nexti:
-                yield element
-                nexti = next(it)
-    except StopIteration:
-        # Consume to *stop*.
-        for i, element in zip(range(i + 1, stop), iterable):
-            pass
-def batched(iterable, n: int):
-    """
-    Yield batches of n elements from an iterable.
-    >>> batched('ABCDEFG', 3) → ABC DEF G
-    """
-    if n < 1:
-        raise ValueError("n must be at least one")
-    it = iter(iterable)
-    while batch := tuple(islice(it, n)):
-        yield batch

colpali_engine/utils/pdf_utils.py DELETED Viewed

@@ -1,87 +0,0 @@
-import glob
-import os
-import random
-from pathlib import Path
-from pdf2image import convert_from_path
-from tqdm import tqdm
-random.seed(42)
-def convert_pdf_to_images(pdf_file: str, save_folder: str):
-    """
-    Convert each page of a pdf to a jpg image and save them in a folder.
-    Args:
-    - pdf_file (str): path to the pdf file
-    - save_folder (str): path to the folder where the images will be saved
-    """
-    images = convert_from_path(pdf_file)
-    for i, image in enumerate(images):
-        if not os.path.exists(save_folder):
-            os.makedirs(save_folder)
-        image.save(os.path.join(save_folder, f"page_{i+1}.jpg"), "JPEG")
-def convert_all_pdfs_to_images(path_to_folder: str, n_samples: int = 0):
-    """
-    Convert all pdfs in a folder and its subfolder to images and save them in a folder.
-    It will sample n_samples pdf files in each subfolder, allowing to have granularity on the number of pdf files to convert.
-    Args:
-    - path_to_folder (str): path to the folder containing the pdf files
-    - n_samples (int): number of pdf files to sample in each subfolder
-    directory structure:
-    - path_to_folder
-        - subfolder1
-            - pdf1
-            - pdf2
-            - ...
-        - subfolder2
-            - pdf1
-            - pdf2
-            - ...
-        - ...
-    """
-    # take n_samples pdf files in each subfolder : I want to take 10 pdf files from each subfolder
-    sub_dirs = [d for d in os.listdir(path_to_folder) if os.path.isdir(os.path.join(path_to_folder, d))]
-    sampled_files = []
-    for sub_dir in sub_dirs:
-        pdf_files = glob.glob(os.path.join(path_to_folder, sub_dir, "*.pdf"))
-        if (n_samples == 0) or (len(pdf_files) <= n_samples):
-            print(f"Taking all pdf files in {sub_dir}")
-            sampled_files.extend(pdf_files)
-        else:
-            print(f"Taking {n_samples} pdf files in {sub_dir}")
-            sampled_files.extend(random.sample(pdf_files, n_samples))
-    pdf_files = [str(file) for file in sampled_files]
-    # Create an empty text file that will contain the file paths of the corrupted pdf files
-    dirpath_corrupted = Path(path_to_folder) / "corrupted_pdf_files.txt"
-    dirpath_corrupted.parent.mkdir(parents=True, exist_ok=True)
-    with dirpath_corrupted.open("w") as f:
-        with tqdm(total=len(pdf_files)) as pbar:
-            for pdf_file in pdf_files:
-                pbar.set_description(f"Processing {pdf_file}")
-                save_folder = os.path.join("pages_extracted", *Path(pdf_file).parts[-2:])
-                if not os.path.exists(os.path.join(path_to_folder, save_folder)):
-                    try:
-                        convert_pdf_to_images(pdf_file, os.path.join(path_to_folder, save_folder))
-                    except Exception as e:
-                        print(f"Error converting {pdf_file}: {e}")
-                        f.write(pdf_file)
-                        f.write("\n")
-                pbar.update(1)
-    return

colpali_engine/utils/plot_utils.py DELETED Viewed

@@ -1,6 +0,0 @@
-import seaborn as sns
-def setup_seaborn():
-    """Apply the seaborn "white" style and the "paper" context with ``font_scale=2``."""
-    sns.set_style("white")
-    sns.set_context("paper", font_scale=2)

colpali_engine/utils/torch_utils.py DELETED Viewed

@@ -1,18 +0,0 @@
-"""
-Utility functions for interpretability.
-"""
-import torch
-def get_torch_device() -> str:
-    """
-    Returns the device and dtype to be used for torch tensors.
-    """
-    if torch.cuda.is_available():
-        device = "cuda:0"
-    elif torch.backends.mps.is_available():  # for Apple Silicon
-        device = "mps"
-    else:
-        device = "cpu"
-    return device

colpali_engine/utils/train_colpali_engine_models.py DELETED Viewed

@@ -1,247 +0,0 @@
-# HuggingFace trainer
-import json
-import os
-from dataclasses import dataclass
-from typing import Callable, Dict, Optional
-import torch
-from datasets import concatenate_datasets
-from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
-from torch.utils.data import DataLoader
-from tqdm import tqdm
-from transformers import AutoTokenizer, Idefics2Processor, PreTrainedModel, PreTrainedTokenizer, TrainingArguments
-from colpali_engine.dataset.custom_collator import CustomCollator
-from colpali_engine.loss.colbert_loss import BiEncoderLoss, BiPairwiseCELoss, ColbertLoss, ColbertPairwiseCELoss
-from colpali_engine.trainer.contrastive_trainer import ContrastiveTrainer
-from colpali_engine.trainer.retrieval_evaluator import CustomEvaluator
-from colpali_engine.utils.gpu_stats import print_gpu_utilization, print_summary
-@dataclass
-class ColModelTrainingConfig:
-    """Settings for a training/evaluation run; ``__post_init__`` fills defaults and prepares the model."""
-    model: PreTrainedModel
-    tr_args: TrainingArguments = None
-    output_dir: str = None
-    max_length: int = 256
-    run_eval: bool = True
-    run_train: bool = True
-    peft_config: Optional[LoraConfig] = None
-    add_suffix: bool = False
-    processor: Idefics2Processor = None
-    tokenizer: PreTrainedTokenizer = None
-    # NOTE(review): this default is a single ColbertLoss instance created when the class is defined
-    # and therefore shared by every config that does not pass its own loss_func.
-    loss_func: Optional[Callable] = ColbertLoss()
-    dataset_loading_func: Optional[Callable] = None
-    eval_dataset_loader: Optional[Dict[str, Callable]] = None
-    pretrained_peft_model_name_or_path: Optional[str] = None
-    def __post_init__(self):
-        """Derive missing defaults, then load or attach adapters on ``self.model``."""
-        # Default output directory: ./models/<model name with "/" replaced by "_">.
-        if self.output_dir is None:
-            sanitized_name = str(self.model.name_or_path).replace("/", "_")
-            self.output_dir = f"./models/{sanitized_name}"
-        if self.tr_args is None:
-            self.tr_args = TrainingArguments(output_dir=self.output_dir)
-        elif self.tr_args.output_dir is None:
-            self.tr_args.output_dir = self.output_dir
-        # cast if string
-        if isinstance(self.tr_args.learning_rate, str):
-            self.tr_args.learning_rate = float(self.tr_args.learning_rate)
-        self.tr_args.remove_unused_columns = False
-        # With neither a processor nor a tokenizer given, load a tokenizer from the model name.
-        if self.processor is None and self.tokenizer is None:
-            print("Using textual model tokenization")
-            self.tokenizer = AutoTokenizer.from_pretrained(self.model.name_or_path)
-        # A pretrained adapter is loaded before any new PEFT configuration is considered.
-        if self.pretrained_peft_model_name_or_path is not None:
-            self.model.load_adapter(self.pretrained_peft_model_name_or_path)
-            print(f"Loaded pretrained adapter from {self.pretrained_peft_model_name_or_path}")
-        if self.peft_config is not None:
-            print("Configurating PEFT model")
-            if self.processor is None:
-                # Might be deprecated - use the "else" branch
-                self.model = prepare_model_for_kbit_training(self.model)  # use_gradient_checkpointing=True
-                # self.model.enable_input_require_grads()
-                self.model = get_peft_model(self.model, self.peft_config)
-                self.model.print_trainable_parameters()
-            else:
-                # Ugly debugging hack
-                # if self.model.model.config.text_config.vocab_size == 32000:
-                #     print("DEBUG: Resizing token embeddings - This should not happen in a real scenario!")
-                #     self.model.model.text_model.resize_token_embeddings(32003)
-                #     self.model.model.vision_model.encoder.layers = self.model.model.vision_model.encoder.layers[0:2]
-                # self.model.enable_input_require_grads()
-                # A new adapter is added only when no pretrained adapter was loaded above.
-                if self.pretrained_peft_model_name_or_path is None:
-                    self.model.add_adapter(self.peft_config)
-                    self.model.enable_adapters()
-                else:
-                    print(f"Adapter already loaded from {self.pretrained_peft_model_name_or_path}. Not overwriting.")
-    # NOTE(review): at this indentation the call below is part of the class body, so it runs once
-    # when the class is defined rather than once per instance. The original indentation may have
-    # been lost; confirm the intended placement.
-    print_gpu_utilization()
-class ColModelTraining:
-    def __init__(self, config: ColModelTrainingConfig) -> None:
-        self.config = config
-        self.model = self.config.model
-        self.dataset = self.config.dataset_loading_func()
-        self.collator = CustomCollator(
-            processor=self.config.processor, tokenizer=self.config.tokenizer, max_length=self.config.max_length
-        )
-        self.current_git_hash = os.popen("git rev-parse HEAD").read().strip()
-        self.retriever_evaluator = CustomEvaluator(
-            is_multi_vector=(
-                isinstance(self.config.loss_func, ColbertLoss)
-                or isinstance(self.config.loss_func, ColbertPairwiseCELoss)
-            )
-        )
-    def train(self) -> None:
-        trainer = ContrastiveTrainer(
-            model=self.model,
-            train_dataset=self.dataset["train"],
-            eval_dataset=self.dataset["test"],
-            args=self.config.tr_args,
-            data_collator=self.collator,
-            loss_func=self.config.loss_func,
-            is_vision_model=self.config.processor is not None,
-        )
-        trainer.args.remove_unused_columns = False
-        result = trainer.train()
-        print_summary(result)
-    def eval_dataset(self, test_dataset):
-        self.model.eval()
-        # # debug
-        # if len(test_dataset) > 200:
-        #     test_dataset = test_dataset.select(range(0, 100))
-        idx_with_query = [idx for idx, sample in enumerate(test_dataset["query"]) if sample is not None]
-        idx_without_query = [idx for idx, sample in enumerate(test_dataset["query"]) if sample is None]
-        dataloader_with_query = DataLoader(
-            test_dataset.select(idx_with_query),
-            batch_size=self.config.tr_args.per_device_eval_batch_size,
-            shuffle=False,
-            collate_fn=self.collator,
-        )
-        dataloader_without_query = DataLoader(
-            test_dataset.select(idx_without_query),
-            batch_size=self.config.tr_args.per_device_eval_batch_size,
-            shuffle=False,
-            collate_fn=self.collator,
-        )
-        # dataset is ordered so that non-null queries come first
-        test_dataset = concatenate_datasets(
-            [test_dataset.select(idx_with_query), test_dataset.select(idx_without_query)]
-        )
-        relevant_docs = {}
-        docidx_2_docid = {}
-        qsidx_2_query = []
-        for idx, sample in enumerate(test_dataset):
-            doc_id = sample["image_filename"] if "image_filename" in sample else str(hash(sample["doc"]))
-            # query_id = sample["query_id"] if "query_id" in sample else str(hash(sample["query"]))
-            if sample["query"] is not None:
-                relevant_docs[str(idx)] = {doc_id: 1}
-                qsidx_2_query.append(str(idx))
-            docidx_2_docid[str(idx)] = doc_id
-        qs = []
-        ps = []
-        device = self.model.device
-        with (torch.no_grad()):
-            for dataloader in [dataloader_with_query, dataloader_without_query]:
-                for batch in tqdm(dataloader):
-                    if "doc_pixel_values" not in batch:
-                        doc = self.model(
-                            input_ids=batch["doc_input_ids"].to(device),
-                            attention_mask=batch["doc_attention_mask"].to(device),
-                        )
-                    else:
-                        if "doc_pixel_attention_mask" in batch:
-                            doc = self.model(
-                                input_ids=batch["doc_input_ids"].to(device),
-                                attention_mask=batch["doc_attention_mask"].to(device),
-                                pixel_values=batch["doc_pixel_values"].to(device),
-                                pixel_attention_mask=batch["doc_pixel_attention_mask"].to(device),
-                            )
-                        else:
-                            doc = self.model(
-                                input_ids=batch["doc_input_ids"].to(device),
-                                attention_mask=batch["doc_attention_mask"].to(device),
-                                pixel_values=batch["doc_pixel_values"].to(device),
-                            )
-                    ps.extend(list(torch.unbind(doc.to("cpu"))))
-                    if "query_input_ids" in batch:
-                        query = self.model(
-                            input_ids=batch["query_input_ids"].to(device),
-                            attention_mask=batch["query_attention_mask"].to(device),
-                        )
-                        # variable len
-                        qs.extend(list(torch.unbind(query.to("cpu"))))
-        print("Embeddings computed, evaluating")
-        scores = self.retriever_evaluator.evaluate(qs, ps)
-        # scores is 2d array of shape (n_queries, n_docs)
-        # turn it into a dict
-        results = {}
-        assert scores.shape[0] == len(qsidx_2_query)
-        for idx, scores_per_query in enumerate(scores):
-            results[qsidx_2_query[idx]] = {
-                docidx_2_docid[str(docidx)]: float(score) for docidx, score in enumerate(scores_per_query)
-            }
-        # evaluate
-        metrics = self.retriever_evaluator.compute_metrics(relevant_docs, results)
-        print(metrics)
-        return metrics
-    def eval(self) -> None:
-        print("Evaluating on validation set")
-        metrics = self.eval_dataset(self.dataset["test"])
-        print(f"Metrics for validation set: {metrics}")
-        all_metrics = {"validation_set": metrics}
-        if self.config.eval_dataset_loader is not None:
-            for test_name, test_dataset_loading_func in self.config.eval_dataset_loader.items():
-                print(f"Evaluating {test_name}")
-                test_ds = test_dataset_loading_func()
-                metrics = self.eval_dataset(test_ds)
-                all_metrics[test_name] = metrics
-                print(f"Metrics for {test_name}: {metrics}")
-                # checkpoint dumps
-                with open(f"{self.config.output_dir}/results.json", "w") as f:
-                    json.dump(all_metrics, f)
-        # save results as json
-        with open(f"{self.config.output_dir}/results.json", "w") as f:
-            json.dump(all_metrics, f)
-    def save(self, config_file):
-        # save model
-        self.model.save_pretrained(self.config.output_dir)
-        if self.config.tokenizer is not None:
-            self.config.tokenizer.save_pretrained(self.config.output_dir)
-        if self.config.processor is not None:
-            self.config.processor.save_pretrained(self.config.output_dir)  # save config
-        # copy-paste the yml file with os
-        os.system(f"cp {config_file} {self.config.output_dir}/training_config.yml")
-        # save git hash of the commit at beginning of training
-        with open(f"{self.config.output_dir}/git_hash.txt", "w") as f:
-            f.write(self.current_git_hash)

colpali_engine/utils/wrapper.py DELETED Viewed

@@ -1,83 +0,0 @@
-import importlib
-from colpali_engine.models.clip_baselines import ColSigLIP, SigLIP
-from colpali_engine.models.colbert_architectures import (
-    BiBERT,
-    BiXLMRoBERTa,
-    ColBERT,
-    ColCamembert,
-    ColLlama,
-    ColXLMRoBERTa,
-)
-from colpali_engine.models.idefics_colbert_architecture import BiIdefics, ColIdefics
-from colpali_engine.models.paligemma_colbert_architecture import (
-    BiNewSiglip,
-    BiPaliLast,
-    BiPaliMean,
-    ColNewSiglip,
-    ColPali,
-)
-# Define the wrapper classes only when the `transformers` package can be located;
-# the matching `else` branch raises at import time otherwise.
-# NOTE(review): only `import importlib` is visible above, and `importlib.util` is a
-# submodule, so this attribute access relies on something else having imported it
-# already - consider `import importlib.util`.
-# NOTE(review): the colpali_engine.models imports above run before this check; if they
-# already require transformers this guard is redundant - confirm.
-if importlib.util.find_spec("transformers") is not None:
-    from transformers import AutoProcessor, AutoTokenizer
-    from transformers.tokenization_utils import PreTrainedTokenizer
-    class AutoProcessorWrapper:
-        """Factory-style class: instantiating it yields ``AutoProcessor.from_pretrained(...)``."""
-
-        def __new__(cls, *args, **kwargs):
-            # Positional and keyword arguments are forwarded untouched.
-            processor = AutoProcessor.from_pretrained(*args, **kwargs)
-            return processor
-    class AutoTokenizerWrapper(PreTrainedTokenizer):
-        """Factory-style class: instantiating it yields ``AutoTokenizer.from_pretrained(...)``."""
-
-        def __new__(cls, *args, **kwargs):
-            # Positional and keyword arguments are forwarded untouched.
-            tokenizer = AutoTokenizer.from_pretrained(*args, **kwargs)
-            return tokenizer
-    class AutoColModelWrapper:
-        """Factory that selects a model class from the checkpoint name and training objective.
-
-        Instantiating this class returns ``<ModelClass>.from_pretrained(*args, **kwargs)``,
-        where the model class is chosen by substring matches on the checkpoint name and,
-        for some families, by the ``training_objective`` keyword.
-        """
-
-        def __new__(cls, *args, **kwargs):
-            """Resolve the model class and load it with ``from_pretrained``.
-
-            Args:
-                *args: Forwarded to ``from_pretrained``; the first positional argument,
-                    when present, is taken as the model name or path.
-                **kwargs: Forwarded to ``from_pretrained`` after ``training_objective``
-                    (default ``"colbertv1"``) has been removed. The model name or path is
-                    read from ``pretrained_model_name_or_path`` when no positional
-                    argument is given.
-
-            Raises:
-                ValueError: If no model name or path is supplied, or if a siglip or
-                    paligemma checkpoint is requested with an unrecognized
-                    ``training_objective``.
-            """
-            if args:
-                pretrained_model_name_or_path = args[0]
-            else:
-                pretrained_model_name_or_path = kwargs.get("pretrained_model_name_or_path")
-            if pretrained_model_name_or_path is None:
-                # Fail with a clear message instead of a bare KeyError, or a TypeError
-                # from the substring checks below.
-                raise ValueError(
-                    "A model name or path is required: pass it as the first positional "
-                    "argument or as `pretrained_model_name_or_path=...`"
-                )
-
-            # The objective only drives dispatch; it is removed so that it is not
-            # forwarded to `from_pretrained`.
-            training_objective = kwargs.pop("training_objective", "colbertv1")
-
-            if "camembert" in pretrained_model_name_or_path:
-                return ColCamembert.from_pretrained(*args, **kwargs)
-
-            if "xlm-roberta" in pretrained_model_name_or_path:
-                if training_objective == "biencoder":
-                    return BiXLMRoBERTa.from_pretrained(*args, **kwargs)
-                return ColXLMRoBERTa.from_pretrained(*args, **kwargs)
-
-            # Only this family is matched case-insensitively.
-            lowered_name = pretrained_model_name_or_path.lower()
-            if "llama" in lowered_name or "croissant" in lowered_name:
-                return ColLlama.from_pretrained(*args, **kwargs)
-
-            if "idefics2" in pretrained_model_name_or_path:
-                if training_objective == "biencoder":
-                    return BiIdefics.from_pretrained(*args, **kwargs)
-                return ColIdefics.from_pretrained(*args, **kwargs)
-
-            if "siglip" in pretrained_model_name_or_path:
-                if training_objective == "biencoder_mean":
-                    return SigLIP.from_pretrained(*args, **kwargs)
-                if training_objective == "colbertv1":
-                    return ColSigLIP.from_pretrained(*args, **kwargs)
-                raise ValueError(f"Training objective {training_objective} not recognized")
-
-            if "paligemma" in pretrained_model_name_or_path:
-                if training_objective == "biencoder_mean":
-                    return BiPaliMean.from_pretrained(*args, **kwargs)
-                if training_objective == "biencoder_last":
-                    return BiPaliLast.from_pretrained(*args, **kwargs)
-                if training_objective == "biencoder_mean_vision":
-                    return BiNewSiglip.from_pretrained(*args, **kwargs)
-                if training_objective == "colbertv1_vision":
-                    return ColNewSiglip.from_pretrained(*args, **kwargs)
-                if training_objective == "colbertv1":
-                    return ColPali.from_pretrained(*args, **kwargs)
-                raise ValueError(f"Training objective {training_objective} not recognized")
-
-            # Fallback for every other checkpoint name.
-            if training_objective == "biencoder":
-                return BiBERT.from_pretrained(*args, **kwargs)
-            return ColBERT.from_pretrained(*args, **kwargs)
-else:
-    # `transformers` could not be located: abort the import of this module.
-    raise ModuleNotFoundError("Transformers must be loaded")