diff --git a/colpali-main/.env.dist b/colpali-main/.env.dist new file mode 100644 index 0000000000000000000000000000000000000000..2e8d4d39fd00b0c9a5e43d7c64cc0820e79a897e --- /dev/null +++ b/colpali-main/.env.dist @@ -0,0 +1,5 @@ +HF_TOKEN= +HF_DATASETS_CACHE= + +VERTEX_PROJECT= +VERTEX_LOCATION= diff --git a/colpali-main/.gitattributes b/colpali-main/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..d08d846a4fdc0b92a8e8b9b615b06f4978da1445 --- /dev/null +++ b/colpali-main/.gitattributes @@ -0,0 +1,31 @@ +*.jsonl filter=lfs diff=lfs merge=lfs -text +*.csv filter=lfs diff=lfs merge=lfs -text +*.ipynb filter=lfs diff=lfs merge=lfs -text + +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/colpali-main/.gitignore b/colpali-main/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d5bb3faf053df45809d5fccb0a3168bc92cb8876 --- /dev/null +++ b/colpali-main/.gitignore @@ -0,0 +1,179 @@ +# Custom +.DS_Store +.env +.litellm_cache/ +data/litellm_cache_captionning/ +.idea +.venv/ +colbert/models/ +logs/ +data/downloaded_datasets/rimes_raw_dataset/ +models/ +!colpali_engine/models +data/ +!*/configs/data/ +data_dir/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/*.png + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/
diff --git a/colpali-main/.python-version b/colpali-main/.python-version
new file mode 100644
index 0000000000000000000000000000000000000000..375f5cabfe6cd1337c375dfa0dbc7fbd3180edb9
--- /dev/null
+++ b/colpali-main/.python-version
@@ -0,0 +1 @@
+3.11.6
diff --git a/colpali-main/LICENSE b/colpali-main/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..c67416730fed7bab154d250168c820fcc27eff97
--- /dev/null
+++ b/colpali-main/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Manuel Faysse, Hugues Sibille, Tony Wu
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/colpali-main/README.md b/colpali-main/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c26dfa7c309b6b3bb8bf7d9695f2b1532d5bb1d1
--- /dev/null
+++ b/colpali-main/README.md
@@ -0,0 +1,222 @@
+# ColPali: Efficient Document Retrieval with Vision Language Models
+
+
+[[Blog]](https://huggingface.co/blog/manu/colpali)
+[[Paper]](https://arxiv.org/abs/2407.01449)
+[[ColPali Model card]](https://huggingface.co/vidore/colpali)
+[[ViDoRe Benchmark]](https://huggingface.co/vidore)
+
+[[HuggingFace Demo]](https://huggingface.co/spaces/manu/ColPali-demo)
+
+
+## Associated Paper
+
+**ColPali: Efficient Document Retrieval with Vision Language Models**
+Manuel Faysse, Hugues Sibille, Tony Wu, Bilel Omrani, Gautier Viaud, Céline Hudelot, Pierre Colombo
+
+This repository contains the code for training custom ColBERT retriever models.
+Notably, we train ColBERT-style retrievers with LLMs (decoders) as well as vision-language models!
+
+## Installation
+
+### From git
+```bash
+pip install git+https://github.com/illuin-tech/colpali
+```
+
+### From source
+```bash
+git clone https://github.com/illuin-tech/colpali
+cd colpali
+pip install -r requirements.txt
+```
+
+## Usage
+
+Example usage of the model is shown in the `scripts` directory.
+
+```bash
+# hackable example script to adapt
+python scripts/infer/run_inference_with_python.py
+```
+
+
+```python
+import torch
+import typer
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import AutoProcessor
+from PIL import Image
+
+from colpali_engine.models.paligemma_colbert_architecture import ColPali
+from colpali_engine.trainer.retrieval_evaluator import CustomEvaluator
+from colpali_engine.utils.colpali_processing_utils import process_images, process_queries
+from colpali_engine.utils.image_from_page_utils import load_from_dataset
+
+
+def main() -> None:
+    """Example script to run inference with ColPali"""
+    # Load model
+    model_name = "vidore/colpali"
+    model = ColPali.from_pretrained("google/paligemma-3b-mix-448", torch_dtype=torch.bfloat16, device_map="cuda").eval()
+    model.load_adapter(model_name)
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    # select images -> load_from_pdf(), load_from_image_urls([""]), load_from_dataset()
+    images = load_from_dataset("vidore/docvqa_test_subsampled")
+    queries = ["From which university does James V. Fiorca come ?", "Who is the japanese prime minister?"]
+
+    # run inference - docs
+    dataloader = DataLoader(
+        images,
+        batch_size=4,
+        shuffle=False,
+        collate_fn=lambda x: process_images(processor, x),
+    )
+    ds = []
+    for batch_doc in tqdm(dataloader):
+        with torch.no_grad():
+            batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}
+            embeddings_doc = model(**batch_doc)
+        ds.extend(list(torch.unbind(embeddings_doc.to("cpu"))))
+
+    # run inference - queries
+    dataloader = DataLoader(
+        queries,
+        batch_size=4,
+        shuffle=False,
+        collate_fn=lambda x: process_queries(processor, x, Image.new("RGB", (448, 448), (255, 255, 255))),
+    )
+
+    qs = []
+    for batch_query in dataloader:
+        with torch.no_grad():
+            batch_query = {k: v.to(model.device) for k, v in batch_query.items()}
+            embeddings_query = model(**batch_query)
+        qs.extend(list(torch.unbind(embeddings_query.to("cpu"))))
+
+    # run evaluation
+    retriever_evaluator = CustomEvaluator(is_multi_vector=True)
+    scores = retriever_evaluator.evaluate(qs, ds)
+    print(scores.argmax(axis=1))
+
+
+if __name__ == "__main__":
+    typer.run(main)
+```
+
+Details are also given in the model card for the base ColPali model on HuggingFace: [ColPali Model card](https://huggingface.co/vidore/colpali).
+
+## Training
+
+```bash
+USE_LOCAL_DATASET=0 python scripts/train/train_colbert.py scripts/configs/siglip/train_siglip_model_debug.yaml
+```
+
+or
+
+```bash
+accelerate launch scripts/train/train_colbert.py scripts/configs/train_colidefics_model.yaml
+```
+
+### Configurations
+All training arguments can be set through a configuration file.
+The configuration file is a YAML file that contains all the arguments for training.
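+
+The example configuration shown further below uses `()` keys to name the Python class that a given YAML block should be instantiated into (for instance `peft.LoraConfig`). As a rough, illustrative sketch of that convention (this is not the repository's actual config loader, which also resolves custom tags such as `!path`, `!ext` and `!import`, and the helper name below is made up for the example), the idea is roughly:
+
+```python
+# Illustrative sketch of the "()" instantiation convention, not the real config loader.
+import importlib
+
+import yaml  # pyyaml
+
+
+def instantiate(node):
+    """Recursively turn {"()": "pkg.module.Class", ...} mappings into objects."""
+    if isinstance(node, dict):
+        # Instantiate children first, then the current node if it carries a "()" key.
+        node = {key: instantiate(value) for key, value in node.items()}
+        target = node.pop("()", None)
+        if target is not None:
+            module_path, _, attr_name = target.rpartition(".")
+            cls = getattr(importlib.import_module(module_path), attr_name)
+            return cls(**node)
+        return node
+    if isinstance(node, list):
+        return [instantiate(value) for value in node]
+    return node
+
+
+raw = yaml.safe_load(
+    """
+    peft_config:
+      (): peft.LoraConfig
+      r: 32
+      lora_alpha: 32
+    """
+)
+peft_config = instantiate(raw)["peft_config"]  # -> peft.LoraConfig(r=32, lora_alpha=32)
+```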
+ +The construction is as follows: + +```python +@dataclass +class ColModelTrainingConfig: + model: PreTrainedModel + tr_args: TrainingArguments = None + output_dir: str = None + max_length: int = 256 + run_eval: bool = True + run_train: bool = True + peft_config: Optional[LoraConfig] = None + add_suffix: bool = False + processor: Idefics2Processor = None + tokenizer: PreTrainedTokenizer = None + loss_func: Optional[Callable] = ColbertLoss() + dataset_loading_func: Optional[Callable] = None + eval_dataset_loader: Optional[Dict[str, Callable]] = None + pretrained_peft_model_name_or_path: Optional[str] = None +``` +### Example + +An example configuration file is: + +```yaml +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad/train_colpali-3b-mix-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "colbertv1" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertPairwiseCELoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + # target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' +``` + + +#### Local training + +```bash +USE_LOCAL_DATASET=0 python scripts/train/train_colbert.py scripts/configs/siglip/train_siglip_model_debug.yaml +``` + + +#### SLURM + +```bash +sbatch --nodes=1 --cpus-per-task=16 --mem-per-cpu=32GB --time=20:00:00 --gres=gpu:1 -p gpua100 --job-name=colidefics --output=colidefics.out --error=colidefics.err --wrap="accelerate launch scripts/train/train_colbert.py scripts/configs/train_colidefics_model.yaml" + +sbatch --nodes=1 --time=5:00:00 -A cad15443 --gres=gpu:8 --constraint=MI250 --job-name=colpali --wrap="python scripts/train/train_colbert.py scripts/configs/train_colpali_model.yaml" +``` + +## CITATION + +```bibtex +@misc{faysse2024colpaliefficientdocumentretrieval, + title={ColPali: Efficient Document Retrieval with Vision Language Models}, + author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo}, + year={2024}, + eprint={2407.01449}, + archivePrefix={arXiv}, + primaryClass={cs.IR}, + url={https://arxiv.org/abs/2407.01449}, +} +``` \ No newline at end of file diff --git a/colpali-main/colpali_engine/__init__.py b/colpali-main/colpali_engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/colpali-main/colpali_engine/__pycache__/__init__.cpython-310.pyc b/colpali-main/colpali_engine/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f2edc71e0a26eeaedc5321d2694a6e4f37e67f44 Binary files /dev/null and b/colpali-main/colpali_engine/__pycache__/__init__.cpython-310.pyc differ diff --git a/colpali-main/colpali_engine/dataset/__init__.py b/colpali-main/colpali_engine/dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/colpali-main/colpali_engine/dataset/custom_collator.py b/colpali-main/colpali_engine/dataset/custom_collator.py new file mode 100644 index 0000000000000000000000000000000000000000..5254f31a9b870f2fdf691b312bb13f58e493bae3 --- /dev/null +++ b/colpali-main/colpali_engine/dataset/custom_collator.py @@ -0,0 +1,244 @@ +from transformers import PreTrainedTokenizer, ProcessorMixin + + +class CustomCollator: + def __init__( + self, + processor: ProcessorMixin = None, + tokenizer: PreTrainedTokenizer = None, + max_length: int = 2048, + add_suffix: bool = False, + ): + self.processor = processor + self.tokenizer = tokenizer + self.image_token_id = None + self.max_length = max_length + self.suffix = "" + if add_suffix: + self.suffix = "\n" * 10 + + if tokenizer is None and processor is None: + raise ValueError("Either processor or tokenizer should be provided.") + + if self.processor is not None: + if self.processor.__class__.__name__ != "SiglipProcessor": + self.image_token_id = self.processor.tokenizer.additional_special_tokens_ids[ + self.processor.tokenizer.additional_special_tokens.index("") + ] + + if self.tokenizer is not None: + raise ValueError("Only one of processor or tokenizer should be provided.") + + if self.tokenizer and self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + + def __call__(self, examples): + if self.processor is None: + return self.forward_text(examples) + if self.processor.__class__.__name__ == "Idefics2Processor": + return self.forward_vision_idefics(examples) + if self.processor.__class__.__name__ == "PaliGemmaProcessor": + return self.forward_vision_pali(examples) + if self.processor.__class__.__name__ == "SiglipProcessor": + return self.forward_vision_siglip(examples) + raise ValueError("Processor not supported") + + def forward_text(self, examples): + texts_doc = [] + texts_query = [] + for example in examples: + text_query = example["query"] + self.suffix + text_doc = example["doc"] + + texts_doc.append(text_doc.strip()) + texts_query.append(text_query.strip()) + + batch_doc = self.tokenizer( + texts_doc, max_length=self.max_length, padding="longest", truncation=True, return_tensors="pt" + ) + batch_query = self.tokenizer( + texts_query, max_length=self.max_length, padding="longest", truncation=True, return_tensors="pt" + ) + + # prefix each key with "doc_" or "query_" to avoid key conflicts + batch_doc = {f"doc_{k}": v for k, v in batch_doc.items()} + batch_query = {f"query_{k}": v for k, v in batch_query.items()} + batch_doc.update(batch_query) + + return batch_doc + + def forward_vision_idefics(self, examples): + texts_doc = [] + texts_query = [] + images = [] + for example in examples: + image = example["image"] + + text_query = None + if example["query"] is not None: + query = example["query"] + messages_query = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"Question: {query}", + }, + ], + }, + ] + text_query = 
self.processor.apply_chat_template(messages_query, add_generation_prompt=False).strip() + + messages_doc = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Describe the image."}, + {"type": "image"}, + ], + }, + ] + + text_doc = self.processor.apply_chat_template(messages_doc, add_generation_prompt=False) + + texts_doc.append(text_doc.strip()) + texts_query.append(text_query) + images.append([image]) + + batch_doc = self.processor( + text=texts_doc, images=images, return_tensors="pt", padding="longest", max_length=self.max_length + ) + + batch_query = None + if all([t is None for t in texts_query]): + print("All queries are None. Returning None for all queries.") + elif any([t is None for t in texts_query]): + raise ValueError("Some queries are None. This collator does not support None queries yet.") + else: + batch_query = self.processor( + text=texts_query, return_tensors="pt", padding="longest", max_length=self.max_length + ) + + # prefix each key with "doc_" or "query_" to avoid key conflicts + batch_doc = {f"doc_{k}": v for k, v in batch_doc.items()} + + if batch_query is not None: + batch_query = {f"query_{k}": v for k, v in batch_query.items()} + batch_doc.update(batch_query) + + return batch_doc + + def forward_vision_pali(self, examples): + texts_doc = [] + texts_query = [] + images = [] + for example in examples: + + if example["image"] is None: + raise ValueError("Image is None - This collator does not support None images yet.") + + image = example["image"].convert("RGB") + images.append(image) + texts_doc.append("Describe the image.") + + if example["query"] is None: + texts_query.append(None) + else: + query = example["query"] + query = f"Question: {query}" + texts_query.append(query) + + batch_doc = self.processor( + text=texts_doc, + images=images, + return_tensors="pt", + padding="longest", + max_length=self.max_length + self.processor.image_seq_length, + ) + + batch_query = None + # check if some but not all queries are None + if all([t is None for t in texts_query]): + print("All queries are None. Returning None for all queries.") + elif any([t is None for t in texts_query]): + raise ValueError("Some queries are None. 
This collator does not support None queries yet.") + else: + batch_query = self.processor( + images=images, # NOTE: the image is not used in batch_query but it is required for calling the processor + text=texts_query, + return_tensors="pt", + padding="longest", + max_length=self.max_length + self.processor.image_seq_length, + ) + del batch_query["pixel_values"] + batch_query["input_ids"] = batch_query["input_ids"][..., self.processor.image_seq_length :] + batch_query["attention_mask"] = batch_query["attention_mask"][..., self.processor.image_seq_length :] + + # prefix each key with "doc_" or "query_" to avoid key conflicts + batch_doc = {f"doc_{k}": v for k, v in batch_doc.items()} + + if batch_query is not None: + batch_query = {f"query_{k}": v for k, v in batch_query.items()} + batch_doc.update(batch_query) + + return batch_doc + + def forward_vision_siglip(self, examples): + texts_doc = [] + texts_query = [] + images = [] + for example in examples: + + if example["image"] is None: + raise ValueError("Image is None - This collator does not support None images yet.") + + image = example["image"].convert("RGB") + images.append(image) + texts_doc.append("Describe the image.") + + if example["query"] is None: + texts_query.append(None) + else: + query = f"Question: {example['query']}" + texts_query.append(query) + + batch_doc = self.processor( + text=texts_doc, + images=images, + return_tensors="pt", + padding="max_length", + truncation=True, + ) + + batch_query = None + # check if some but not all queries are None + if all([t is None for t in texts_query]): + # print("All queries are None.") + pass + elif any([t is None for t in texts_query]): + raise ValueError("Some queries are None. This collator does not support None queries yet.") + else: + batch_query = self.processor( + images=images, + text=texts_query, + return_tensors="pt", + padding="max_length", + max_length=self.max_length, + truncation=True, + ) + del batch_query["pixel_values"] + + # prefix each key with "doc_" or "query_" to avoid key conflicts + batch_doc = {f"doc_{k}": v for k, v in batch_doc.items()} + + if batch_query is not None: + batch_query = {f"query_{k}": v for k, v in batch_query.items()} + batch_doc.update(batch_query) + # add attention mask for queries + batch_doc["query_attention_mask"] = batch_doc["query_input_ids"].ne(0).long() + + # add attention mask for docs + batch_doc["doc_attention_mask"] = batch_doc["doc_input_ids"].ne(0).long() + + return batch_doc diff --git a/colpali-main/colpali_engine/dataset/hf_dataset_names.py b/colpali-main/colpali_engine/dataset/hf_dataset_names.py new file mode 100644 index 0000000000000000000000000000000000000000..27ca88090671effdbbd31dbd359dbbe1cfb63bb4 --- /dev/null +++ b/colpali-main/colpali_engine/dataset/hf_dataset_names.py @@ -0,0 +1,52 @@ +from enum import Enum + + +class TrainDatasets(Enum): + """ + Dataset names for the training datasets used in HuggingFace Datasets. 
+ """ + + government_reports = "vidore/syntheticDocQA_government_reports_train" + healthcare_industry = "vidore/syntheticDocQA_healthcare_industry_train" + energy = "vidore/syntheticDocQA_energy_train" + artificial_intelligence = "vidore/syntheticDocQA_artificial_intelligence_train" + arxivqa = "vidore/arxivqa_train" + docvqa = "vidore/docvqa_train" + infovqa = "vidore/infovqa_train" + tatqa = "vidore/tatqa_train" + + @staticmethod + def get_synthetic_datasets(): + return [ + TrainDatasets.government_reports, + TrainDatasets.healthcare_industry, + TrainDatasets.energy, + TrainDatasets.artificial_intelligence, + ] + + +class TestImagesDirpath(Enum): + """ + Dataset names for the test datasets used in HuggingFace Datasets. + """ + + government_reports = "data/government_reports" + healthcare_industry = "data/healthcare_industry" + energy = "data/energy" + artificial_intelligence = "data/scrapped_pdfs_split/pages_extracted/artificial_intelligence_test" + arxivqa = "data/arxivqa" + docvqa = "data/docvqa" + infovqa = "data/infovqa" + tatqa = "data/tatqa" + + +class CaptionedSyntheticDatasets(Enum): + """ + Dataset names for the captioned synthetic datasets used in HuggingFace Datasets. + """ + + shift = "vidore/baseline_cap_shiftproject_test" + + +class SyntheticDocQATest(Enum): + shift = "vidore/shiftproject_test" diff --git a/colpali-main/colpali_engine/evaluation/__init__.py b/colpali-main/colpali_engine/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a92c4b6306f451b5db4bef85ca3d7e0840b2e431 --- /dev/null +++ b/colpali-main/colpali_engine/evaluation/__init__.py @@ -0,0 +1 @@ +from .eval_manager import EvalManager diff --git a/colpali-main/colpali_engine/evaluation/eval_manager.py b/colpali-main/colpali_engine/evaluation/eval_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..30d1ff67485ad35bb5f5b7ce2311b53b7195e871 --- /dev/null +++ b/colpali-main/colpali_engine/evaluation/eval_manager.py @@ -0,0 +1,178 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any, ClassVar, Dict, Optional + +import pandas as pd + + +class EvalManager: + """ + Stores evaluation results for various datasets and metrics. + + The data is stored in a pandas DataFrame with a MultiIndex for columns. + The first level of the MultiIndex is the dataset name and the second level is the metric name. + + Usage: + >>> evaluator = Evaluator.from_dirpath("data/evaluation_results/") + >>> print(evaluator.data) + + """ + + model_col: ClassVar[str] = "model" + dataset_col: ClassVar[str] = "dataset" + metric_col: ClassVar[str] = "metric" + + def __init__(self, data: Optional[pd.DataFrame] = None): + if data is None: + data = pd.DataFrame() + self._df = data + self._df.index = self._df.index.rename(EvalManager.model_col) + + def __str__(self) -> str: + return self.data.__str__() + + @staticmethod + def from_dict(data: Dict[Any, Any]) -> EvalManager: + """ + Load evaluation results from a dictionary. 
+ + Expected format: + { + "model1": pd.read_json(path1).T.stack(), + "model2": pd.read_json(path2).T.stack(), + } + + """ + df = pd.DataFrame.from_dict(data, orient="index") + return EvalManager(df) + + @staticmethod + def from_json(path: str | Path) -> EvalManager: + datapath = Path(path) + if not datapath.is_file(): + raise FileNotFoundError(f"{path} is not a file") + data = {} + data[datapath.stem] = pd.read_json(datapath).T.stack() # pylint: disable=no-member + return EvalManager.from_dict(data) + + @staticmethod + def from_dir(datadir: str | Path) -> EvalManager: + datadir_ = Path(datadir) + if not datadir_.is_dir(): + raise FileNotFoundError(f"{datadir} is not a directory") + + eval_files = list(datadir_.glob("*.json")) + + data = {} + + for filepath in eval_files: + data[filepath.stem] = pd.read_json(filepath).T.stack() # pylint: disable=no-member + + return EvalManager.from_dict(data) + + @staticmethod + def from_csv(path: str | Path) -> EvalManager: + """ + Load evaluation results from a CSV file. + """ + try: + df = pd.read_csv(path, index_col=0, header=[0, 1]) + return EvalManager(df) + except Exception as e: + print(f"Error loading {path}: {e}") + raise e + + @property + def data(self) -> pd.DataFrame: + """ + Returns the evaluation results as a pandas DataFrame. + """ + return self._df.copy() + + @property + def models(self) -> pd.Index: + """ + Returns the models for which there are evaluation results. + """ + return self.data.index + + @property + def datasets(self) -> pd.Index: + """ + Returns the datasets for which there are evaluation results. + """ + return self.data.columns.get_level_values(0).unique() + + @property + def metrics(self) -> pd.Index: + """ + Returns the metrics for which there are evaluation results. + """ + return self.data.columns.get_level_values(1) + + @staticmethod + def melt(df: pd.DataFrame) -> pd.DataFrame: + """ + Melt a suitable DataFrame (e.g. returned by `get_df_for_dataset` and + `get_df_for_metric`) into a 'long' format. + """ + return df.T.reset_index(names=[EvalManager.dataset_col, EvalManager.metric_col]).melt( + id_vars=[EvalManager.dataset_col, EvalManager.metric_col], + var_name=EvalManager.model_col, + value_name="score", + ) + + @property + def melted(self) -> pd.DataFrame: + """ + Returns the evaluation results as a 'melted' DataFrame. + Useful for plotting with seaborn. + """ + return EvalManager.melt(self.data) + + def get_df_for_model(self, model: str) -> pd.DataFrame: + if model not in self.data.index: + raise ValueError(f"Model {model} not found in the evaluation results") + return self.data.loc[[model], :] # type: ignore + + def get_df_for_dataset(self, dataset: str) -> pd.DataFrame: + if dataset not in self.datasets: + raise ValueError(f"Dataset {dataset} not found in the evaluation results") + return self.data.loc[:, (dataset, slice(None))] # type: ignore + + def get_df_for_metric(self, metric: str) -> pd.DataFrame: + if metric not in self.metrics: + raise ValueError(f"Metric {metric} not found in the evaluation results") + return self.data.loc[:, (slice(None), metric)] # type: ignore + + def sort_by_dataset(self, ascending: bool = True) -> EvalManager: + """ + Sort the evaluation results by dataset name. + """ + df = self.data.T.sort_index(level=0, ascending=ascending).T + return EvalManager(df) + + def sort_by_metric(self, ascending: bool = True) -> EvalManager: + """ + Sort the evaluation results by metric name. 
+ """ + df = self.data.T.sort_index(level=1, ascending=ascending).T + return EvalManager(df) + + def sort_columns(self, ascending: bool = True) -> EvalManager: + """ + Sort the evaluation results by dataset name and then by metric name. + """ + df = self.data.T.sort_index(level=[0, 1], ascending=ascending).T + return EvalManager(df) + + def to_csv(self, path: str | Path): + """ + Save the evaluation results to a CSV file. + + Using `Evaluation.from_csv(path_to_saved_csv)` will load the evaluation results back into memory. + """ + savepath = Path(path) + savepath.parent.mkdir(parents=True, exist_ok=True) + self.data.to_csv(savepath) diff --git a/colpali-main/colpali_engine/interpretability/__init__.py b/colpali-main/colpali_engine/interpretability/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3e442b0606e379ce675d742b3a996db97a367c62 --- /dev/null +++ b/colpali-main/colpali_engine/interpretability/__init__.py @@ -0,0 +1,4 @@ +from .plot_utils import * +from .processor import * +from .torch_utils import * +from .vit_configs import * diff --git a/colpali-main/colpali_engine/interpretability/gen_interpretability_plots.py b/colpali-main/colpali_engine/interpretability/gen_interpretability_plots.py new file mode 100644 index 0000000000000000000000000000000000000000..5f377482f2cd2d01f6cdccdd2b26d91e4b490f34 --- /dev/null +++ b/colpali-main/colpali_engine/interpretability/gen_interpretability_plots.py @@ -0,0 +1,113 @@ +import pprint +from dataclasses import asdict, dataclass +from pathlib import Path +from uuid import uuid4 + +import matplotlib.pyplot as plt +import torch +from einops import rearrange +from PIL import Image +from tqdm import trange + +from colpali_engine.interpretability.plot_utils import plot_patches +from colpali_engine.interpretability.processor import ColPaliProcessor +from colpali_engine.interpretability.torch_utils import normalize_attention_map_per_query_token +from colpali_engine.interpretability.vit_configs import VIT_CONFIG +from colpali_engine.models.paligemma_colbert_architecture import ColPali + +OUTDIR_INTERPRETABILITY = Path("outputs/interpretability") + + +@dataclass +class InterpretabilityInput: + query: str + image: Image.Image + start_idx_token: int + end_idx_token: int + + +def generate_interpretability_plots( + model: ColPali, + processor: ColPaliProcessor, + query: str, + image: Image.Image, + savedir: str | Path | None = None, + add_special_prompt_to_doc: bool = True, +) -> None: + + # Sanity checks + if len(model.active_adapters()) != 1: + raise ValueError("The model must have exactly one active adapter.") + + if model.config.name_or_path not in VIT_CONFIG: + raise ValueError("The model must be referred to in the VIT_CONFIG dictionary.") + vit_config = VIT_CONFIG[model.config.name_or_path] + + # Handle savepath + if not savedir: + savedir = OUTDIR_INTERPRETABILITY / str(uuid4()) + print(f"No savepath provided. 
Results will be saved to: `{savedir}`.") + elif isinstance(savedir, str): + savedir = Path(savedir) + savedir.mkdir(parents=True, exist_ok=True) + + # Resize the image to square + input_image_square = image.resize((vit_config.resolution, vit_config.resolution)) + + # Preprocess the inputs + input_text_processed = processor.process_text(query).to(model.device) + input_image_processed = processor.process_image(image, add_special_prompt=add_special_prompt_to_doc).to( + model.device + ) + + # Forward pass + with torch.no_grad(): + output_text = model.forward(**asdict(input_text_processed)) # (1, n_text_tokens, hidden_dim) + + # NOTE: `output_image`` will have shape: + # (1, n_patch_x * n_patch_y, hidden_dim) if `add_special_prompt_to_doc` is False + # (1, n_patch_x * n_patch_y + n_special_tokens, hidden_dim) if `add_special_prompt_to_doc` is True + with torch.no_grad(): + output_image = model.forward(**asdict(input_image_processed)) + + if add_special_prompt_to_doc: # remove the special tokens + output_image = output_image[ + :, : processor.processor.image_seq_length, : + ] # (1, n_patch_x * n_patch_y, hidden_dim) + + output_image = rearrange( + output_image, "b (h w) c -> b h w c", h=vit_config.n_patch_per_dim, w=vit_config.n_patch_per_dim + ) # (1, n_patch_x, n_patch_y, hidden_dim) + + # Get the unnormalized attention map + attention_map = torch.einsum( + "bnk,bijk->bnij", output_text, output_image + ) # (1, n_text_tokens, n_patch_x, n_patch_y) + attention_map_normalized = normalize_attention_map_per_query_token( + attention_map + ) # (1, n_text_tokens, n_patch_x, n_patch_y) + attention_map_normalized = attention_map_normalized.float() + + # Get text token information + n_tokens = input_text_processed.input_ids.size(1) + text_tokens = processor.tokenizer.tokenize(processor.decode(input_text_processed.input_ids[0])) + print("Text tokens:") + pprint.pprint(text_tokens) + print("\n") + + for token_idx in trange(1, n_tokens - 1, desc="Iterating over tokens..."): # exclude the and the "\n" tokens + fig, axis = plot_patches( + input_image_square, + vit_config.patch_size, + vit_config.resolution, + patch_opacities=attention_map_normalized[0, token_idx, :, :], + style="dark_background", + ) + + fig.suptitle(f"Token #{token_idx}: `{text_tokens[token_idx]}`", color="white", fontsize=14) + savepath = savedir / f"token_{token_idx}.png" + fig.savefig(savepath) + print(f"Saved attention map for token {token_idx} (`{text_tokens[token_idx]}`) to `{savepath}`.\n") + plt.close(fig) + + return diff --git a/colpali-main/colpali_engine/interpretability/plot_utils.py b/colpali-main/colpali_engine/interpretability/plot_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..13b41865a5c786acf1848f433a34948b20962293 --- /dev/null +++ b/colpali-main/colpali_engine/interpretability/plot_utils.py @@ -0,0 +1,131 @@ +from typing import Any, Dict, Optional, Tuple, cast + +import matplotlib.pyplot as plt +import numpy as np +import numpy.typing as npt +import seaborn as sns +import torch +from PIL import Image + +MAX_OPACITY = 255 + + +def plot_patches( + img: Image.Image, + patch_size: int, + image_resolution: int, + patch_opacities: Optional[npt.NDArray | torch.Tensor] = None, + figsize: Tuple[int, int] = (8, 8), + style: Dict[str, Any] | str | None = None, +) -> Tuple[plt.Figure, plt.Axes]: + """ + Plot patches of a square image. + Set `style` to "dark_background" if your image has a light background. 
+ """ + + # Get the number of patches + if image_resolution % patch_size != 0: + raise ValueError("The image resolution must be divisible by the patch size.") + num_patches = image_resolution // patch_size + + # Default style + if style is None: + style = {} + + # Sanity checks + if patch_opacities is not None: + if isinstance(patch_opacities, torch.Tensor): + patch_opacities = cast(npt.NDArray, patch_opacities.cpu().numpy()) + if patch_opacities.shape != (num_patches, num_patches): + raise ValueError("The shape of the patch_opacities tensor is not correct.") + if not np.all((0 <= patch_opacities) & (patch_opacities <= 1)): + raise ValueError("The patch_opacities tensor must have values between 0 and 1.") + + # If the image is not square, raise an error + if img.size[0] != img.size[1]: + raise ValueError("The image must be square.") + + # Get the image as a numpy array + img_array = np.array(img.convert("RGBA")) # (H, W, C) where the last channel is the alpha channel + + # Create a figure + with plt.style.context(style): + fig, axis = plt.subplots(num_patches, num_patches, figsize=figsize) + + # Plot the patches + for i in range(num_patches): + for j in range(num_patches): + patch = img_array[i * patch_size : (i + 1) * patch_size, j * patch_size : (j + 1) * patch_size, :] + # Set the opacity of the patch + if patch_opacities is not None: + patch[:, :, -1] = round(patch_opacities[i, j] * MAX_OPACITY) + axis[i, j].imshow(patch) + axis[i, j].axis("off") + + fig.subplots_adjust(wspace=0.1, hspace=0.1) + + fig.tight_layout() + + return fig, axis + + +def plot_attention_heatmap( + img: Image.Image, + patch_size: int, + image_resolution: int, + attention_map: npt.NDArray | torch.Tensor, + figsize: Tuple[int, int] = (8, 8), + style: Dict[str, Any] | str | None = None, + show_colorbar: bool = False, + show_axes: bool = False, +) -> Tuple[plt.Figure, plt.Axes]: + """ + Plot a heatmap of the attention map over the image. + The image must be square and `attention_map` must be normalized between 0 and 1. 
+ """ + + # Get the number of patches + if image_resolution % patch_size != 0: + raise ValueError("The image resolution must be divisible by the patch size.") + num_patches = image_resolution // patch_size + + # Default style + if style is None: + style = {} + + # Sanity checks + if isinstance(attention_map, torch.Tensor): + attention_map = cast(npt.NDArray, attention_map.cpu().numpy()) + if attention_map.shape != (num_patches, num_patches): + raise ValueError("The shape of the patch_opacities tensor is not correct.") + if not np.all((0 <= attention_map) & (attention_map <= 1)): + raise ValueError("The patch_opacities tensor must have values between 0 and 1.") + + # If the image is not square, raise an error + if img.size[0] != img.size[1]: + raise ValueError("The image must be square.") + + # Get the image as a numpy array + img_array = np.array(img.convert("RGBA")) # (H, W, C) where the last channel is the alpha channel + + # Get the attention map as a numpy array + attention_map_image = Image.fromarray((attention_map * 255).astype("uint8")).resize( + img.size, Image.Resampling.BICUBIC + ) + + # Create a figure + with plt.style.context(style): + fig, ax = plt.subplots(figsize=figsize) + ax.imshow(img_array) + im = ax.imshow( + attention_map_image, + cmap=sns.color_palette("mako", as_cmap=True), + alpha=0.5, + ) + if show_colorbar: + fig.colorbar(im) + if not show_axes: + ax.set_axis_off() + fig.tight_layout() + + return fig, ax diff --git a/colpali-main/colpali_engine/interpretability/processor.py b/colpali-main/colpali_engine/interpretability/processor.py new file mode 100644 index 0000000000000000000000000000000000000000..98ac6d5911eac0083d3f1f30031ac5918588ee20 --- /dev/null +++ b/colpali-main/colpali_engine/interpretability/processor.py @@ -0,0 +1,116 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import List, cast + +import torch +from PIL import Image +from transformers import LlamaTokenizerFast, PaliGemmaProcessor + + +@dataclass +class ColPaliTextInput: + input_ids: torch.Tensor + attention_mask: torch.Tensor + + def to(self, device: torch.device) -> ColPaliTextInput: + return ColPaliTextInput( + input_ids=self.input_ids.to(device), + attention_mask=self.attention_mask.to(device), + ) + + +@dataclass +class ColPaliImageInput: + input_ids: torch.Tensor + pixel_values: torch.Tensor + attention_mask: torch.Tensor + + def to(self, device: str | torch.device) -> ColPaliImageInput: + return ColPaliImageInput( + input_ids=self.input_ids.to(device), + pixel_values=self.pixel_values.to(device), + attention_mask=self.attention_mask.to(device), + ) + + +class ColPaliProcessor: + def __init__(self, processor: PaliGemmaProcessor): + self.processor = processor + self.tokenizer = cast(LlamaTokenizerFast, self.processor.tokenizer) # type: ignore + + @staticmethod + def from_pretrained(model_name: str) -> ColPaliProcessor: + return ColPaliProcessor(processor=cast(PaliGemmaProcessor, PaliGemmaProcessor.from_pretrained(model_name))) + + def process_text( + self, + text: str | List[str], + padding: str = "longest", + return_tensors: str = "pt", + add_special_tokens: bool = True, + ) -> ColPaliTextInput: + """ + Process text inputs for the model. + If `add_special_tokens` is True (default), the text will be prepended with the token and appended with " \n". 
+ """ + if add_special_tokens: + if isinstance(text, str): + text = self.tokenizer.bos_token + text + "\n" + elif isinstance(text, list): + text = [self.tokenizer.bos_token + t + "\n" for t in text] + else: + raise ValueError("text must be a string or a list of strings.") + + batch_output = self.tokenizer( + text, padding=padding, return_tensors=return_tensors, add_special_tokens=add_special_tokens + ) + + return ColPaliTextInput( + input_ids=cast(torch.Tensor, batch_output["input_ids"]), + attention_mask=cast(torch.Tensor, batch_output["attention_mask"]), + ) + + def process_image( + self, + image: Image.Image | List[Image.Image], + padding: str = "longest", + do_convert_rgb: bool = True, + return_tensors: str = "pt", + add_special_prompt: bool = True, + ) -> ColPaliImageInput: + # NOTE: The special prompt was used at training time, + special_prompt = "Describe the image." if add_special_prompt else None + if isinstance(image, Image.Image): + text_input = [special_prompt] + elif isinstance(image, list): + text_input = [special_prompt] * len(image) + else: + raise ValueError("image must be a PIL Image or a list of PIL Images.") + + batch_output = self.processor( + text=text_input, + images=image, + padding=padding, + do_convert_rgb=do_convert_rgb, + return_tensors=return_tensors, + ) + + if add_special_prompt: + return ColPaliImageInput( + input_ids=batch_output["input_ids"], + pixel_values=batch_output["pixel_values"], + attention_mask=batch_output["attention_mask"], + ) + else: + return ColPaliImageInput( + input_ids=batch_output["input_ids"][:, : self.processor.image_seq_length], + pixel_values=batch_output["pixel_values"][:, : self.processor.image_seq_length], + attention_mask=batch_output["attention_mask"][:, : self.processor.image_seq_length], + ) + + def decode(self, *args, **kwargs): + return self.tokenizer.decode(*args, **kwargs) + + def batch_decode(self, *args, **kwargs): + return self.tokenizer.batch_decode(*args, **kwargs) diff --git a/colpali-main/colpali_engine/interpretability/torch_utils.py b/colpali-main/colpali_engine/interpretability/torch_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2960f3e85e4329ad1def921a0b698bc4536d431d --- /dev/null +++ b/colpali-main/colpali_engine/interpretability/torch_utils.py @@ -0,0 +1,60 @@ +import logging + +import torch + +logger = logging.getLogger(__name__) + +EPSILON = 1e-10 + + +def normalize_attention_map_per_query_token(x: torch.Tensor) -> torch.Tensor: + """ + Normalizes the attention map for ColPali for each query token. + The output tensor will have values in the range [0, 1] and the + same shape as the input tensor. + + Args: + x: The attention map tensor of shape (batch_size, n_text_tokens, n_patch_x, n_patch_y). + """ + if x.ndim != 4: + raise ValueError("The input tensor must have 4 dimensions.") + + # Compute the minimum values along the last two dimensions (n_patch_x, n_patch_y) + min_vals = x.min(dim=-1, keepdim=True)[0].min(dim=-2, keepdim=True)[0] + + # Compute the maximum values along the last two dimensions (n_patch_x, n_patch_y) + max_vals = x.max(dim=-1, keepdim=True)[0].max(dim=-2, keepdim=True)[0] + + # Normalize the tensor + x_normalized = (x - min_vals) / (max_vals - min_vals + EPSILON) # Adding a small epsilon to avoid division by zero + + return x_normalized + + +def normalize_attention_map_per_query(x: torch.Tensor) -> torch.Tensor: + """ + Normalizes the attention map for ColPali for each query token. 
+ The output tensor will have values in the range [0, 1] and the + same shape as the input tensor. + + Args: + x: The attention map tensor of shape (batch_size, n_text_tokens, n_patch_x, n_patch_y). + """ + # Log warning + logger.warning( + "This function should not be used for ColPali because it doesn't make sense to normalize the attention map across the text tokens." + ) + + if x.ndim != 4: + raise ValueError("The input tensor must have 4 dimensions.") + + # Compute the minimum values along the last three dimensions (n_text_tokens, n_patch_x, n_patch_y) + min_vals = x.min(dim=-1, keepdim=True)[0].min(dim=-2, keepdim=True)[0].min(dim=-3, keepdim=True)[0] + + # Compute the maximum values along the last three dimensions (n_text_tokens, n_patch_x, n_patch_y) + max_vals = x.max(dim=-1, keepdim=True)[0].max(dim=-2, keepdim=True)[0].max(dim=-3, keepdim=True)[0] + + # Normalize the tensor + x_normalized = (x - min_vals) / (max_vals - min_vals + EPSILON) # Adding a small epsilon to avoid division by zero + + return x_normalized diff --git a/colpali-main/colpali_engine/interpretability/vit_configs.py b/colpali-main/colpali_engine/interpretability/vit_configs.py new file mode 100644 index 0000000000000000000000000000000000000000..66b4bceb2d030ef206ed36697a27b789f062b96a --- /dev/null +++ b/colpali-main/colpali_engine/interpretability/vit_configs.py @@ -0,0 +1,23 @@ +from dataclasses import dataclass +from typing import Dict + + +@dataclass +class ViTConfig: + patch_size: int + resolution: int + + @property + def n_patch_per_dim(self) -> int: + if self.resolution % self.patch_size != 0: + raise ValueError(f"Resolution {self.resolution} is not divisible by patch size {self.patch_size}") + return self.resolution // self.patch_size + + +VIT_CONFIG: Dict[str, ViTConfig] = { + "google/siglip-so400m-patch14-384": ViTConfig(patch_size=14, resolution=384), + "timm/ViT-SO400M-14-SigLIP-384": ViTConfig(patch_size=14, resolution=384), + "google/paligemma-3b-mix-448": ViTConfig( + patch_size=14, resolution=448 + ), # based on "timm/ViT-SO400M-14-SigLIP-384" with increased resolution +} diff --git a/colpali-main/colpali_engine/loss/__init__.py b/colpali-main/colpali_engine/loss/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9613afd7a8f9ea0b6294e4e0df23e6951dcd073a --- /dev/null +++ b/colpali-main/colpali_engine/loss/__init__.py @@ -0,0 +1 @@ +from .colbert_loss import ColbertLoss diff --git a/colpali-main/colpali_engine/loss/colbert_loss.py b/colpali-main/colpali_engine/loss/colbert_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..c52a216b31f3aade638b2adb08f76033f5d7ddb6 --- /dev/null +++ b/colpali-main/colpali_engine/loss/colbert_loss.py @@ -0,0 +1,122 @@ +import torch +import torch.nn.functional as F +from torch.nn import CrossEntropyLoss + + +class BiEncoderLoss(torch.nn.Module): + def __init__(self): + super().__init__() + self.ce_loss = CrossEntropyLoss() + # self.pooling_strategy = pooling_strategy + + def forward(self, query_embeddings, doc_embeddings): + """ + query_embeddings: (batch_size, dim) + doc_embeddings: (batch_size, dim) + """ + + scores = torch.einsum("bd,cd->bc", query_embeddings, doc_embeddings) + + loss_rowwise = self.ce_loss(scores, torch.arange(scores.shape[0], device=scores.device)) + # loss_columnwise = self.ce_loss(scores.T, torch.arange(scores.shape[1], device=scores.device)) + # loss = (loss_rowwise + loss_columnwise) / 2 + return loss_rowwise + + +class ColbertLoss(torch.nn.Module): + def __init__(self): + 
super().__init__() + self.ce_loss = CrossEntropyLoss() + + def forward(self, query_embeddings, doc_embeddings): + """ + query_embeddings: (batch_size, num_query_tokens, dim) + doc_embeddings: (batch_size, num_doc_tokens, dim) + """ + + scores = torch.einsum("bnd,csd->bcns", query_embeddings, doc_embeddings).max(dim=3)[0].sum(dim=2) + + # scores = torch.zeros((query_embeddings.shape[0], doc_embeddings.shape[0]), device=query_embeddings.device) + # for i in range(query_embeddings.shape[0]): + # for j in range(doc_embeddings.shape[0]): + # # step 1 - dot product --> (s1,s2) + # q2d_scores = torch.matmul(query_embeddings[i], doc_embeddings[j].T) + # # step 2 -> max on doc --> (s1) + # q_scores = torch.max(q2d_scores, dim=1)[0] + # # step 3 --> sum the max score --> (1) + # sum_q_score = torch.sum(q_scores) + # # step 4 --> assert is scalar + # scores[i, j] = sum_q_score + + # assert (scores_einsum - scores < 0.0001).all().item() + + loss_rowwise = self.ce_loss(scores, torch.arange(scores.shape[0], device=scores.device)) + # TODO: comparing between queries might not make sense since it's a sum over the length of the query + # loss_columnwise = self.ce_loss(scores.T, torch.arange(scores.shape[1], device=scores.device)) + # loss = (loss_rowwise + loss_columnwise) / 2 + return loss_rowwise + + +class ColbertPairwiseCELoss(torch.nn.Module): + def __init__(self): + super().__init__() + self.ce_loss = CrossEntropyLoss() + + def forward(self, query_embeddings, doc_embeddings): + """ + query_embeddings: (batch_size, num_query_tokens, dim) + doc_embeddings: (batch_size, num_doc_tokens, dim) + + Positive scores are the diagonal of the scores matrix. + """ + + # Compute the ColBERT scores + scores = ( + torch.einsum("bnd,csd->bcns", query_embeddings, doc_embeddings).max(dim=3)[0].sum(dim=2) + ) # (batch_size, batch_size) + + # Positive scores are the diagonal of the scores matrix. + pos_scores = scores.diagonal() # (batch_size,) + + # Negative score for a given query is the maximum of the scores against all all other pages. + # NOTE: We exclude the diagonal by setting it to a very low value: since we know the maximum score is 1, + # we can subtract 1 from the diagonal to exclude it from the maximum operation. + neg_scores = scores - torch.eye(scores.shape[0], device=scores.device) * 1e6 # (batch_size, batch_size) + neg_scores = neg_scores.max(dim=1)[0] # (batch_size,) + + # Compute the loss + # The loss is computed as the negative log of the softmax of the positive scores + # relative to the negative scores. + # This can be simplified to log-sum-exp of negative scores minus the positive score + # for numerical stability. + # torch.vstack((pos_scores, neg_scores)).T.softmax(1)[:, 0].log()*(-1) + loss = F.softplus(neg_scores - pos_scores).mean() + + return loss + + +class BiPairwiseCELoss(torch.nn.Module): + def __init__(self): + super().__init__() + self.ce_loss = CrossEntropyLoss() + + def forward(self, query_embeddings, doc_embeddings): + """ + query_embeddings: (batch_size, dim) + doc_embeddings: (batch_size, dim) + """ + + scores = torch.einsum("bd,cd->bc", query_embeddings, doc_embeddings) + + pos_scores = scores.diagonal() + neg_scores = scores - torch.eye(scores.shape[0], device=scores.device) * 1e6 + neg_scores = neg_scores.max(dim=1)[0] + + # Compute the loss + # The loss is computed as the negative log of the softmax of the positive scores + # relative to the negative scores. + # This can be simplified to log-sum-exp of negative scores minus the positive score + # for numerical stability. 
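+        # softplus(neg_scores - pos_scores) is equivalent to -log(sigmoid(pos_scores - neg_scores)), i.e. a pairwise logistic loss on the score margin.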
+ loss = F.softplus(neg_scores - pos_scores).mean() + + return loss diff --git a/colpali-main/colpali_engine/models/__init__.py b/colpali-main/colpali_engine/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/colpali-main/colpali_engine/models/__pycache__/__init__.cpython-310.pyc b/colpali-main/colpali_engine/models/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b2575064035fd713786886734ecc4f4a6bbfb0f0 Binary files /dev/null and b/colpali-main/colpali_engine/models/__pycache__/__init__.cpython-310.pyc differ diff --git a/colpali-main/colpali_engine/models/__pycache__/paligemma_colbert_architecture.cpython-310.pyc b/colpali-main/colpali_engine/models/__pycache__/paligemma_colbert_architecture.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c120dc43b7851c9f6b59991d280525995f1db46 Binary files /dev/null and b/colpali-main/colpali_engine/models/__pycache__/paligemma_colbert_architecture.cpython-310.pyc differ diff --git a/colpali-main/colpali_engine/models/clip_baselines.py b/colpali-main/colpali_engine/models/clip_baselines.py new file mode 100644 index 0000000000000000000000000000000000000000..0f256b074a22cc3fe2199b85f3f6f00a1442d6d7 --- /dev/null +++ b/colpali-main/colpali_engine/models/clip_baselines.py @@ -0,0 +1,144 @@ +import os +from typing import Optional + +import torch +from transformers import SiglipModel + + +class SigLIP(SiglipModel): + def forward(self, *args, **kwargs): + """ + Forward pass through Llama and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. + + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + return self.forward_branch(*args, **kwargs) + + def forward_branch( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ): + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is not None: + # Use SigLIP model's config for some fields (if specified) instead of those of vision & text components. 
+ + outputs = self.vision_model( + pixel_values=pixel_values.to(dtype=self.dtype), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + else: + outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + embeds = outputs[1] + + # normalized features + embeds = embeds / embeds.norm(p=2, dim=-1, keepdim=True) + return embeds + + +class ColSigLIP(SiglipModel): + def __init__(self, config): + super(ColSigLIP, self).__init__(config=config) + self.dim = 128 + self.custom_vision_proj = torch.nn.Linear(self.config.vision_config.hidden_size, self.dim) + self.custom_text_proj = torch.nn.Linear(self.config.text_config.hidden_size, self.dim) + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Llama and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. + + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + return self.forward_branch(*args, **kwargs) + + def forward_branch( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ): + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is not None: + # Use SigLIP model's config for some fields (if specified) instead of those of vision & text components. 
+ + outputs = self.vision_model( + pixel_values=pixel_values.to(dtype=self.dtype), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + last_hidden_states = outputs.last_hidden_state + + proj = self.custom_vision_proj(last_hidden_states) + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + + else: + outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_states = outputs.last_hidden_state + + proj = self.custom_text_proj(last_hidden_states) + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + proj = proj * attention_mask.unsqueeze(-1) + + # normalized features + return proj diff --git a/colpali-main/colpali_engine/models/colbert_architectures.py b/colpali-main/colpali_engine/models/colbert_architectures.py new file mode 100644 index 0000000000000000000000000000000000000000..97b0eedcc5dcbeff4f0b7b8dd9a23ba8841ced1f --- /dev/null +++ b/colpali-main/colpali_engine/models/colbert_architectures.py @@ -0,0 +1,177 @@ +from torch import nn +from transformers import ( + BertModel, + BertPreTrainedModel, + CamembertModel, + CamembertPreTrainedModel, + LlamaModel, + LlamaPreTrainedModel, + XLMRobertaModel, + XLMRobertaPreTrainedModel, +) + + +class ColCamembert(CamembertPreTrainedModel): + def __init__(self, config): + super(ColCamembert, self).__init__(config=config) + self.roberta: CamembertPreTrainedModel = CamembertModel(config) + self.dim = 128 + self.linear = nn.Linear(self.roberta.config.hidden_size, self.dim) + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Camenbert and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. + + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + outputs = self.roberta(*args, **kwargs) + last_hidden_states = outputs[0] # (batch_size, sequence_length, hidden_size) + proj = self.linear(last_hidden_states) + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + proj = proj * kwargs["attention_mask"].unsqueeze(-1) + return proj + + +class ColXLMRoBERTa(XLMRobertaPreTrainedModel): + def __init__(self, config): + super(ColXLMRoBERTa, self).__init__(config=config) + self.roberta: XLMRobertaPreTrainedModel = XLMRobertaModel(config) + self.dim = 128 + self.linear = nn.Linear(self.roberta.config.hidden_size, self.dim) + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Roberta and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. 
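+
+        Example (sketch; the checkpoint name is only illustrative):
+            >>> from transformers import AutoTokenizer
+            >>> tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
+            >>> model = ColXLMRoBERTa.from_pretrained("xlm-roberta-base")
+            >>> batch = tokenizer(["What is ColBERT?"], return_tensors="pt")
+            >>> embeddings = model(**batch)  # (1, num_tokens, 128)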
+ + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + outputs = self.roberta(*args, **kwargs) + last_hidden_states = outputs[0] # (batch_size, sequence_length, hidden_size) + proj = self.linear(last_hidden_states) + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + proj = proj * kwargs["attention_mask"].unsqueeze(-1) + return proj + + +class BiXLMRoBERTa(XLMRobertaPreTrainedModel): + def __init__(self, config): + super(BiXLMRoBERTa, self).__init__(config=config) + self.roberta: XLMRobertaPreTrainedModel = XLMRobertaModel(config) + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Roberta and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. + + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + outputs = self.roberta(*args, **kwargs) + last_hidden_states = outputs[0] # (batch_size, sequence_length, hidden_size) + # pooling - mean tokens that have attention mask == 1 + proj = last_hidden_states * kwargs["attention_mask"].unsqueeze(-1) + proj = proj.sum(dim=1) / kwargs["attention_mask"].sum(dim=1, keepdim=True) + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + return proj + + +class ColBERT(BertPreTrainedModel): + def __init__(self, config): + super(ColBERT, self).__init__(config=config) + self.bert: BertModel = BertModel(config) + self.dim = 128 + self.linear = nn.Linear(self.bert.config.hidden_size, self.dim) + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through BERT and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. + + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + outputs = self.bert(*args, **kwargs) + last_hidden_states = outputs[0] # (batch_size, sequence_length, hidden_size) + proj = self.linear(last_hidden_states) + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + proj = proj * kwargs["attention_mask"].unsqueeze(-1) + return proj + + +class BiBERT(BertPreTrainedModel): + def __init__(self, config): + super(BiBERT, self).__init__(config=config) + self.bert: BertModel = BertModel(config) + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through BERT and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. 
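+
+        Note: unlike the Col* variants, this bi-encoder mean-pools the token
+        embeddings over the attention mask and L2-normalises the result, so it
+        yields one vector of shape (batch_size, hidden_size) per sequence.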
+ + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + outputs = self.bert(*args, **kwargs) + last_hidden_states = outputs[0] # (batch_size, sequence_length, hidden_size) + # pooling - mean tokens that have attention mask == 1 + proj = last_hidden_states * kwargs["attention_mask"].unsqueeze(-1) + proj = proj.sum(dim=1) / kwargs["attention_mask"].sum(dim=1, keepdim=True) + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + return proj + + +class ColLlama(LlamaPreTrainedModel): + def __init__(self, config): + super(ColLlama, self).__init__(config=config) + self.model: LlamaModel = LlamaModel(config) + self.dim = 128 + self.linear = nn.Linear(self.model.config.hidden_size, self.dim) + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Llama and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. + + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + outputs = self.model(*args, **kwargs) + last_hidden_states = outputs[0] # (batch_size, sequence_length, hidden_size) + proj = self.linear(last_hidden_states) + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + proj = proj * kwargs["attention_mask"].unsqueeze(-1) + return proj diff --git a/colpali-main/colpali_engine/models/idefics_colbert_architecture.py b/colpali-main/colpali_engine/models/idefics_colbert_architecture.py new file mode 100644 index 0000000000000000000000000000000000000000..0fe67163d572f6c91d2840b0a43ccb9d29f4bdd3 --- /dev/null +++ b/colpali-main/colpali_engine/models/idefics_colbert_architecture.py @@ -0,0 +1,57 @@ +from torch import nn +from transformers import Idefics2Model, Idefics2PreTrainedModel + + +class BiIdefics(Idefics2PreTrainedModel): + def __init__(self, config): + super(BiIdefics, self).__init__(config=config) + self.model: Idefics2Model = Idefics2Model(config) + self.pooling_strategy = "last" + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Llama and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. + + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + outputs = self.model(*args, **kwargs) + last_hidden_states = outputs[0] # (batch_size, sequence_length, hidden_size) + # pooling - last token + proj = last_hidden_states[:, -1, :] + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + return proj + + +class ColIdefics(Idefics2PreTrainedModel): + def __init__(self, config): + super(ColIdefics, self).__init__(config=config) + self.model: Idefics2Model = Idefics2Model(config) + self.dim = 128 + self.linear = nn.Linear(self.model.config.text_config.hidden_size, self.dim) + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Llama and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. 
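+
+        Note: inputs are expected to be built with the Idefics2 processor chat
+        template (see colpali_engine.utils.colidefics_processing_utils); the
+        output keeps one 128-dim embedding per token, zeroed where the
+        attention mask is 0.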
+ + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + outputs = self.model(*args, **kwargs) + last_hidden_states = outputs[0] # (batch_size, sequence_length, hidden_size) + proj = self.linear(last_hidden_states) + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + proj = proj * kwargs["attention_mask"].unsqueeze(-1) + return proj diff --git a/colpali-main/colpali_engine/models/paligemma_colbert_architecture.py b/colpali-main/colpali_engine/models/paligemma_colbert_architecture.py new file mode 100644 index 0000000000000000000000000000000000000000..b167c6a437cfba550eb304775c7acd8d46c1e579 --- /dev/null +++ b/colpali-main/colpali_engine/models/paligemma_colbert_architecture.py @@ -0,0 +1,191 @@ +import torch +from torch import nn +from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration, PaliGemmaPreTrainedModel + + +class BiPaliLast(PaliGemmaPreTrainedModel): + def __init__(self, config): + super(BiPaliLast, self).__init__(config=config) + self.model: PaliGemmaForConditionalGeneration = PaliGemmaForConditionalGeneration(config) + self.pooling_strategy = "last" + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Llama and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. + + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + outputs = self.model(*args, output_hidden_states=True, **kwargs) + last_hidden_states = outputs.hidden_states[-1] # (batch_size, sequence_length, hidden_size) + # pooling - last token + proj = last_hidden_states[:, -1, :] + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + return proj + + +class BiPaliMean(PaliGemmaPreTrainedModel): + def __init__(self, config): + super(BiPaliMean, self).__init__(config=config) + self.model: PaliGemmaForConditionalGeneration = PaliGemmaForConditionalGeneration(config) + self.pooling_strategy = "mean" + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Llama and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. + + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + outputs = self.model(*args, output_hidden_states=True, **kwargs) + last_hidden_states = outputs.hidden_states[-1] # (batch_size, sequence_length, hidden_size) + # pooling -mean on attention mask==1 + proj = torch.sum(last_hidden_states * kwargs["attention_mask"].unsqueeze(-1), dim=1) / torch.sum( + kwargs["attention_mask"], dim=1, keepdim=True + ) + proj = proj / proj.norm(dim=-1, keepdim=True) + return proj + + +class ColPali(PaliGemmaPreTrainedModel): + def __init__(self, config): + super(ColPali, self).__init__(config=config) + self.model: PaliGemmaForConditionalGeneration = PaliGemmaForConditionalGeneration(config) + self.dim = 128 + self.custom_text_proj = nn.Linear(self.model.config.text_config.hidden_size, self.dim) + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Llama and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. 
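+
+        Example (sketch mirroring demo/app.py; checkpoint names and the input
+        image are assumptions):
+            >>> import torch
+            >>> from transformers import AutoProcessor
+            >>> from colpali_engine.utils.colpali_processing_utils import process_images
+            >>> model = ColPali.from_pretrained(
+            ...     "google/paligemma-3b-mix-448", torch_dtype=torch.bfloat16
+            ... ).eval()
+            >>> model.load_adapter("vidore/colpali")
+            >>> processor = AutoProcessor.from_pretrained("vidore/colpali")
+            >>> batch = process_images(processor, [page_image])  # page_image: a PIL.Image of a document page
+            >>> embeddings = model(**batch)  # (1, num_doc_tokens, 128)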
+ + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + outputs = self.model(*args, output_hidden_states=True, **kwargs) + last_hidden_states = outputs.hidden_states[-1] # (batch_size, sequence_length, hidden_size) + proj = self.custom_text_proj(last_hidden_states) + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + proj = proj * kwargs["attention_mask"].unsqueeze(-1) + return proj + + +class ColNewSiglip(PaliGemmaPreTrainedModel): + def __init__(self, config): + super(ColNewSiglip, self).__init__(config=config) + self.model: PaliGemmaForConditionalGeneration = PaliGemmaForConditionalGeneration(config) + self.dim = 128 + self.custom_image_proj = nn.Linear(self.model.config.vision_config.projection_dim, self.dim) + self.custom_text_proj = nn.Linear(self.model.config.text_config.hidden_size, self.dim) + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Llama and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. + + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + # outputs = self.model(*args, output_hidden_states=True, **kwargs) + if "pixel_values" in kwargs: + image_features = self.vision_model_output(*args, **kwargs) + # print(f"Doc: {image_features.shape}") + proj = self.custom_image_proj(image_features) + # print(f"Doc proj: {proj.shape}") + proj = proj / proj.norm(dim=-1, keepdim=True) + else: + outputs = self.model(*args, output_hidden_states=True, **kwargs) + last_hidden_states = outputs.hidden_states[-1] # (batch_size, sequence_length, hidden_size) + # print(f"Query: {last_hidden_states.shape}") + proj = self.custom_text_proj(last_hidden_states) + # print(f"Query proj: {proj.shape}") + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + proj = proj * kwargs["attention_mask"].unsqueeze(-1) + return proj + + def vision_model_output(self, input_ids: torch.LongTensor = None, pixel_values: torch.FloatTensor = None, **kwargs): + + inputs_embeds = self.model.get_input_embeddings()(input_ids) + # 2. Merge text and images + if pixel_values is not None and input_ids.shape[1] != 1: + image_outputs = self.model.vision_tower(pixel_values.to(inputs_embeds.dtype)) + selected_image_feature = image_outputs.last_hidden_state + image_features = self.model.multi_modal_projector(selected_image_feature) + + return image_features + + raise ValueError("pixel_values is None or input_ids.shape[1] == 1") + + +class BiNewSiglip(PaliGemmaPreTrainedModel): + def __init__(self, config): + super(BiNewSiglip, self).__init__(config=config) + self.model: PaliGemmaForConditionalGeneration = PaliGemmaForConditionalGeneration(config) + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Llama and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. 
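+
+        Note: this bi-encoder pools to a single L2-normalised vector per input,
+        taking the mean over projected image-patch features for documents and a
+        masked mean over token hidden states for queries.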
+ + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + # outputs = self.model(*args, output_hidden_states=True, **kwargs) + if "pixel_values" in kwargs: + image_features = self.vision_model_output(*args, **kwargs) + # print(f"Doc: {image_features.shape}") + # pool image features + proj = torch.mean(image_features, dim=1) + # print(f"Doc proj: {proj.shape}") + norm = proj.norm(dim=-1, keepdim=True) + proj = proj / norm + else: + outputs = self.model(*args, output_hidden_states=True, **kwargs) + last_hidden_states = outputs.hidden_states[-1] # (batch_size, sequence_length, hidden_size) + # pooling -mean on attention mask==1 + + proj = torch.sum(last_hidden_states * kwargs["attention_mask"].unsqueeze(-1), dim=1) / torch.sum( + kwargs["attention_mask"], dim=1, keepdim=True + ) + # print(f"Query proj: {proj.shape}") + norm = proj.norm(dim=-1, keepdim=True) + proj = proj / norm + return proj + + def vision_model_output(self, input_ids: torch.LongTensor = None, pixel_values: torch.FloatTensor = None, **kwargs): + + inputs_embeds = self.model.get_input_embeddings()(input_ids) + # 2. Merge text and images + if pixel_values is not None and input_ids.shape[1] != 1: + image_outputs = self.model.vision_tower(pixel_values.to(inputs_embeds.dtype)) + selected_image_feature = image_outputs.last_hidden_state + image_features = self.model.multi_modal_projector(selected_image_feature) + + return image_features + + raise ValueError("pixel_values is None or input_ids.shape[1] == 1") diff --git a/colpali-main/colpali_engine/trainer/__init__.py b/colpali-main/colpali_engine/trainer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/colpali-main/colpali_engine/trainer/__pycache__/__init__.cpython-310.pyc b/colpali-main/colpali_engine/trainer/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de280e48c5c65802553843c15bc0a3926ad87f1a Binary files /dev/null and b/colpali-main/colpali_engine/trainer/__pycache__/__init__.cpython-310.pyc differ diff --git a/colpali-main/colpali_engine/trainer/__pycache__/retrieval_evaluator.cpython-310.pyc b/colpali-main/colpali_engine/trainer/__pycache__/retrieval_evaluator.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e512781d51b374d7d86bf21c11e830cfe7a6ed5f Binary files /dev/null and b/colpali-main/colpali_engine/trainer/__pycache__/retrieval_evaluator.cpython-310.pyc differ diff --git a/colpali-main/colpali_engine/trainer/contrastive_trainer.py b/colpali-main/colpali_engine/trainer/contrastive_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..a1126ba2298d3f3b41937b7b85bf8f641375025c --- /dev/null +++ b/colpali-main/colpali_engine/trainer/contrastive_trainer.py @@ -0,0 +1,64 @@ +import torch +from transformers import Trainer + + +class ContrastiveTrainer(Trainer): + def __init__(self, loss_func, is_vision_model, *args, **kwargs): + super().__init__(*args, **kwargs) + self.loss_func = loss_func + self.is_vision_model = is_vision_model + + def compute_loss(self, model, inputs, return_outputs=False): + query_outputs = model(input_ids=inputs["query_input_ids"], attention_mask=inputs["query_attention_mask"]) + if self.is_vision_model: + if "doc_pixel_attention_mask" not in inputs: + doc_outputs = model( + input_ids=inputs["doc_input_ids"], + attention_mask=inputs["doc_attention_mask"], + pixel_values=inputs["doc_pixel_values"], + ) + else: + 
doc_outputs = model( + input_ids=inputs["doc_input_ids"], + attention_mask=inputs["doc_attention_mask"], + pixel_values=inputs["doc_pixel_values"], + pixel_attention_mask=inputs["doc_pixel_attention_mask"], + ) + else: + doc_outputs = model(input_ids=inputs["doc_input_ids"], attention_mask=inputs["doc_attention_mask"]) + + loss = self.loss_func(query_outputs, doc_outputs) + return (loss, (query_outputs, doc_outputs)) if return_outputs else loss + + def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=True): + """This function is used to generate predictions and return the loss for the given inputs.""" + if not prediction_loss_only: + raise ValueError("prediction_step is only called with prediction_loss_only=True") + + with torch.no_grad(): + if self.is_vision_model: + if "doc_pixel_attention_mask" not in inputs: + doc_outputs = model( + input_ids=inputs["doc_input_ids"], + attention_mask=inputs["doc_attention_mask"], + pixel_values=inputs["doc_pixel_values"], + ) + else: + doc_outputs = model( + input_ids=inputs["doc_input_ids"], + attention_mask=inputs["doc_attention_mask"], + pixel_values=inputs["doc_pixel_values"], + pixel_attention_mask=inputs["doc_pixel_attention_mask"], + ) + query_outputs = model( + input_ids=inputs["query_input_ids"], attention_mask=inputs["query_attention_mask"] + ) + else: + + query_outputs = model( + input_ids=inputs["query_input_ids"], attention_mask=inputs["query_attention_mask"] + ) + doc_outputs = model(input_ids=inputs["doc_input_ids"], attention_mask=inputs["doc_attention_mask"]) + + loss = self.loss_func(query_outputs, doc_outputs) + return loss, None, None diff --git a/colpali-main/colpali_engine/trainer/retrieval_evaluator.py b/colpali-main/colpali_engine/trainer/retrieval_evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..023cc4d0d9f934266fadeeb811253b98146d408f --- /dev/null +++ b/colpali-main/colpali_engine/trainer/retrieval_evaluator.py @@ -0,0 +1,72 @@ +import torch +from mteb.evaluation.evaluators import RetrievalEvaluator + + +class CustomEvaluator: + def __init__(self, is_multi_vector=False): + self.is_multi_vector = is_multi_vector + self.mteb_evaluator = RetrievalEvaluator() + + def evaluate(self, qs, ps): + if self.is_multi_vector: + scores = self.evaluate_colbert(qs, ps) + else: + scores = self.evaluate_biencoder(qs, ps) + + assert scores.shape[0] == len(qs) + + arg_score = scores.argmax(dim=1) + # compare to arange + accuracy = (arg_score == torch.arange(scores.shape[0], device=scores.device)).sum().item() / scores.shape[0] + print(arg_score) + print(f"Top 1 Accuracy (verif): {accuracy}") + + # cast to numpy + # scores = scores.cpu().numpy() + scores = scores.to(torch.float32).cpu().numpy() + return scores + + def compute_metrics(self, relevant_docs, results, **kwargs): + # wrap mteb package + + ndcg, _map, recall, precision, naucs = self.mteb_evaluator.evaluate( + relevant_docs, + results, + self.mteb_evaluator.k_values, + ignore_identical_ids=kwargs.get("ignore_identical_ids", True), + ) + mrr = self.mteb_evaluator.evaluate_custom(relevant_docs, results, self.mteb_evaluator.k_values, "mrr") + scores = { + **{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()}, + **{f"map_at_{k.split('@')[1]}": v for (k, v) in _map.items()}, + **{f"recall_at_{k.split('@')[1]}": v for (k, v) in recall.items()}, + **{f"precision_at_{k.split('@')[1]}": v for (k, v) in precision.items()}, + **{f"mrr_at_{k.split('@')[1]}": v for (k, v) in mrr[0].items()}, + **{f"naucs_at_{k.split('@')[1]}": v for (k, 
v) in naucs.items()}, + } + return scores + + def evaluate_colbert(self, qs, ps, batch_size=128) -> torch.Tensor: + scores = [] + for i in range(0, len(qs), batch_size): + scores_batch = [] + qs_batch = torch.nn.utils.rnn.pad_sequence(qs[i : i + batch_size], batch_first=True, padding_value=0).to( + "cpu" + ) + for j in range(0, len(ps), batch_size): + ps_batch = torch.nn.utils.rnn.pad_sequence( + ps[j : j + batch_size], batch_first=True, padding_value=0 + ).to("cpu") + scores_batch.append(torch.einsum("bnd,csd->bcns", qs_batch, ps_batch).max(dim=3)[0].sum(dim=2)) + scores_batch = torch.cat(scores_batch, dim=1).cpu() + scores.append(scores_batch) + scores = torch.cat(scores, dim=0) + return scores + + def evaluate_biencoder(self, qs, ps) -> torch.Tensor: + + qs = torch.stack(qs) + ps = torch.stack(ps) + + scores = torch.einsum("bd,cd->bc", qs, ps) + return scores diff --git a/colpali-main/colpali_engine/utils/__init__.py b/colpali-main/colpali_engine/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/colpali-main/colpali_engine/utils/__pycache__/__init__.cpython-310.pyc b/colpali-main/colpali_engine/utils/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e656a8123fa410e78425c32f73002247613b9bb Binary files /dev/null and b/colpali-main/colpali_engine/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/colpali-main/colpali_engine/utils/__pycache__/colpali_processing_utils.cpython-310.pyc b/colpali-main/colpali_engine/utils/__pycache__/colpali_processing_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f127c2e9ebbd0543ac577b9b4065471f78b6cbb Binary files /dev/null and b/colpali-main/colpali_engine/utils/__pycache__/colpali_processing_utils.cpython-310.pyc differ diff --git a/colpali-main/colpali_engine/utils/__pycache__/image_from_page_utils.cpython-310.pyc b/colpali-main/colpali_engine/utils/__pycache__/image_from_page_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1bd4cd125a9f3eb8bc3d44ba713cc3273663d0b0 Binary files /dev/null and b/colpali-main/colpali_engine/utils/__pycache__/image_from_page_utils.cpython-310.pyc differ diff --git a/colpali-main/colpali_engine/utils/colidefics_processing_utils.py b/colpali-main/colpali_engine/utils/colidefics_processing_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2d19b4b145cb5409e71899265b8c2c1145d8a018 --- /dev/null +++ b/colpali-main/colpali_engine/utils/colidefics_processing_utils.py @@ -0,0 +1,53 @@ +# Utils for processing images and queries for ColPaLi + +def process_images(processor, images, max_length: int = 50): + texts_doc = [] + images = [image.convert("RGB") for image in images] + + for _ in images: + messages_doc = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Describe the image."}, + {"type": "image"}, + ], + }, + ] + + text_doc = processor.apply_chat_template(messages_doc, add_generation_prompt=False) + texts_doc.append(text_doc.strip()) + + batch_doc = processor( + text=texts_doc, + images=images, + return_tensors="pt", + padding="longest", + ) + return batch_doc + + +def process_queries(processor, queries, mock_image, max_length: int = 50): + texts_query = [] + for query in queries: + messages_query = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"Question: {query}", + }, + ], + }, + ] + text_query = 
processor.apply_chat_template(messages_query, add_generation_prompt=False).strip() + texts_query.append(text_query) + + batch_query = processor( + text=texts_query, + return_tensors="pt", + padding="longest", + max_length=max_length, + ) + return batch_query diff --git a/colpali-main/colpali_engine/utils/colpali_processing_utils.py b/colpali-main/colpali_engine/utils/colpali_processing_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4d2e577f5f7899712db5c3ebeef37c531192e89a --- /dev/null +++ b/colpali-main/colpali_engine/utils/colpali_processing_utils.py @@ -0,0 +1,36 @@ +# Utils for processing images and queries for ColPaLi + + +def process_images(processor, images, max_length: int = 50): + texts_doc = ["Describe the image."] * len(images) + images = [image.convert("RGB") for image in images] + + batch_doc = processor( + text=texts_doc, + images=images, + return_tensors="pt", + padding="longest", + max_length=max_length + processor.image_seq_length, + ) + return batch_doc + + +def process_queries(processor, queries, mock_image, max_length: int = 50): + texts_query = [] + for query in queries: + query = f"Question: {query}" + texts_query.append(query) + + batch_query = processor( + images=[mock_image.convert("RGB")] * len(texts_query), + # NOTE: the image is not used in batch_query but it is required for calling the processor + text=texts_query, + return_tensors="pt", + padding="longest", + max_length=max_length + processor.image_seq_length, + ) + del batch_query["pixel_values"] + + batch_query["input_ids"] = batch_query["input_ids"][..., processor.image_seq_length :] + batch_query["attention_mask"] = batch_query["attention_mask"][..., processor.image_seq_length :] + return batch_query diff --git a/colpali-main/colpali_engine/utils/dataset_transformation.py b/colpali-main/colpali_engine/utils/dataset_transformation.py new file mode 100644 index 0000000000000000000000000000000000000000..8a328198abdbf25312092371a5492b34bfe66fcd --- /dev/null +++ b/colpali-main/colpali_engine/utils/dataset_transformation.py @@ -0,0 +1,158 @@ +import os + +from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset + +USE_LOCAL_DATASET = os.environ.get("USE_LOCAL_DATASET", "1") == "1" + + +def add_metadata_column(dataset, column_name, value): + def add_source(example): + example[column_name] = value + return example + + return dataset.map(add_source) + + +def load_train_set() -> DatasetDict: + + ds_paths = [ + "infovqa_train", + "docvqa_train", + "arxivqa_train", + "tatdqa_train", + "syntheticDocQA_government_reports_train", + "syntheticDocQA_healthcare_industry_train", + "syntheticDocQA_artificial_intelligence_train", + "syntheticDocQA_energy_train", + ] + base_path = "./data_dir/" if USE_LOCAL_DATASET else "vidore/" + ds_tot = [] + for path in ds_paths: + cpath = base_path + path + ds = load_dataset(cpath, split="train") + if "arxivqa" in path: + # subsample 10k + ds = ds.shuffle(42).select(range(10000)) + ds_tot.append(ds) + + dataset = concatenate_datasets(ds_tot) + dataset = dataset.shuffle(seed=42) + # split into train and test + dataset_eval = dataset.select(range(500)) + dataset = dataset.select(range(500, len(dataset))) + ds_dict = DatasetDict({"train": dataset, "test": dataset_eval}) + return ds_dict + + +def load_train_set_with_tabfquad() -> DatasetDict: + + ds_paths = [ + "infovqa_train", + "docvqa_train", + "arxivqa_train", + "tatdqa_train", + "tabfquad_train_subsampled", + "syntheticDocQA_government_reports_train", + 
"syntheticDocQA_healthcare_industry_train", + "syntheticDocQA_artificial_intelligence_train", + "syntheticDocQA_energy_train", + ] + base_path = "./data_dir/" if USE_LOCAL_DATASET else "vidore/" + ds_tot = [] + for path in ds_paths: + cpath = base_path + path + ds = load_dataset(cpath, split="train") + if "arxivqa" in path: + # subsample 10k + ds = ds.shuffle(42).select(range(10000)) + ds_tot.append(ds) + + dataset = concatenate_datasets(ds_tot) + dataset = dataset.shuffle(seed=42) + # split into train and test + dataset_eval = dataset.select(range(500)) + dataset = dataset.select(range(500, len(dataset))) + ds_dict = DatasetDict({"train": dataset, "test": dataset_eval}) + return ds_dict + + +def load_train_set_with_docmatix() -> DatasetDict: + + ds_paths = [ + "infovqa_train", + "docvqa_train", + "arxivqa_train", + "tatdqa_train", + "tabfquad_train_subsampled", + "syntheticDocQA_government_reports_train", + "syntheticDocQA_healthcare_industry_train", + "syntheticDocQA_artificial_intelligence_train", + "syntheticDocQA_energy_train", + "Docmatix_filtered_train", + ] + base_path = "./data_dir/" if USE_LOCAL_DATASET else "vidore/" + ds_tot = [] + for path in ds_paths: + cpath = base_path + path + ds = load_dataset(cpath, split="train") + if "arxivqa" in path: + # subsample 10k + ds = ds.shuffle(42).select(range(10000)) + ds_tot.append(ds) + + dataset = concatenate_datasets(ds_tot) + dataset = dataset.shuffle(seed=42) + # split into train and test + dataset_eval = dataset.select(range(500)) + dataset = dataset.select(range(500, len(dataset))) + ds_dict = DatasetDict({"train": dataset, "test": dataset_eval}) + return ds_dict + + +def load_docvqa_dataset() -> DatasetDict: + if USE_LOCAL_DATASET: + dataset_doc = load_dataset("./data_dir/DocVQA", "DocVQA", split="validation") + dataset_doc_eval = load_dataset("./data_dir/DocVQA", "DocVQA", split="test") + dataset_info = load_dataset("./data_dir/DocVQA", "InfographicVQA", split="validation") + dataset_info_eval = load_dataset("./data_dir/DocVQA", "InfographicVQA", split="test") + else: + dataset_doc = load_dataset("lmms-lab/DocVQA", "DocVQA", split="validation") + dataset_doc_eval = load_dataset("lmms-lab/DocVQA", "DocVQA", split="test") + dataset_info = load_dataset("lmms-lab/DocVQA", "InfographicVQA", split="validation") + dataset_info_eval = load_dataset("lmms-lab/DocVQA", "InfographicVQA", split="test") + + # concatenate the two datasets + dataset = concatenate_datasets([dataset_doc, dataset_info]) + dataset_eval = concatenate_datasets([dataset_doc_eval, dataset_info_eval]) + # sample 100 from eval dataset + dataset_eval = dataset_eval.shuffle(seed=42).select(range(200)) + + # rename question as query + dataset = dataset.rename_column("question", "query") + dataset_eval = dataset_eval.rename_column("question", "query") + + # create new column image_filename that corresponds to ucsf_document_id if not None, else image_url + dataset = dataset.map( + lambda x: {"image_filename": x["ucsf_document_id"] if x["ucsf_document_id"] is not None else x["image_url"]} + ) + dataset_eval = dataset_eval.map( + lambda x: {"image_filename": x["ucsf_document_id"] if x["ucsf_document_id"] is not None else x["image_url"]} + ) + + ds_dict = DatasetDict({"train": dataset, "test": dataset_eval}) + + return ds_dict + + +class TestSetFactory: + def __init__(self, dataset_path): + self.dataset_path = dataset_path + + def __call__(self, *args, **kwargs): + dataset = load_dataset(self.dataset_path, split="test") + return dataset + + +if __name__ == "__main__": + ds = 
TestSetFactory("vidore/tabfquad_test_subsampled")() + print(ds) diff --git a/colpali-main/colpali_engine/utils/gpu_stats.py b/colpali-main/colpali_engine/utils/gpu_stats.py new file mode 100644 index 0000000000000000000000000000000000000000..4d3efa36bd9c5ff781a9afd798bada3d095a538e --- /dev/null +++ b/colpali-main/colpali_engine/utils/gpu_stats.py @@ -0,0 +1,24 @@ +# cond import +try: + from pynvml import * + + def print_gpu_utilization(): + nvmlInit() + handle = nvmlDeviceGetHandleByIndex(0) + info = nvmlDeviceGetMemoryInfo(handle) + print(f"GPU memory occupied: {info.used // 1024 ** 2} MB.") + + def print_summary(result): + print(f"Time: {result.metrics['train_runtime']:.2f}") + print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}") + print_gpu_utilization() + +except ImportError: + print("pynvml not found. GPU stats will not be printed.") + + def print_summary(result): + print(f"Time: {result.metrics['train_runtime']:.2f}") + print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}") + + def print_gpu_utilization(): + pass diff --git a/colpali-main/colpali_engine/utils/image_from_page_utils.py b/colpali-main/colpali_engine/utils/image_from_page_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6bcc158b5ef1fe675a885a812f0abd2748101486 --- /dev/null +++ b/colpali-main/colpali_engine/utils/image_from_page_utils.py @@ -0,0 +1,21 @@ +import requests +from PIL import Image + + +def load_from_pdf(pdf_path: str): + from pdf2image import convert_from_path + + images = convert_from_path(pdf_path) + return images + + +def load_from_image_urls(urls: str): + images = [Image.open(requests.get(url, stream=True).raw) for url in urls] + return images + + +def load_from_dataset(dataset): + from datasets import load_dataset + + dataset = load_dataset(dataset, split="test") + return dataset["image"] diff --git a/colpali-main/colpali_engine/utils/image_utils.py b/colpali-main/colpali_engine/utils/image_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..61a31c236b33a65f8474363b3d058c6b29718482 --- /dev/null +++ b/colpali-main/colpali_engine/utils/image_utils.py @@ -0,0 +1,64 @@ +""" +Utility functions for working with images. +""" + +import base64 +import io + +from PIL import Image + + +def scale_image(image: Image.Image, new_height: int = 1024) -> Image.Image: + """ + Scale an image to a new height while maintaining the aspect ratio. + """ + # Calculate the scaling factor + width, height = image.size + aspect_ratio = width / height + new_width = int(new_height * aspect_ratio) + + # Resize the image + scaled_image = image.resize((new_width, new_height)) + + return scaled_image + + +def scale_to_max_dimension(image: Image.Image, max_dimension: int = 1024) -> Image.Image: + """ + Scale an image to a maximum dimension while maintaining the aspect ratio. + """ + # Get the dimensions of the image + width, height = image.size + + max_original_dimension = max(width, height) + + if max_original_dimension < max_dimension: + return image + + # Calculate the scaling factor + aspect_ratio = max_dimension / max_original_dimension + new_width = int(width * aspect_ratio) + new_height = int(height * aspect_ratio) + + # Resize the image + scaled_image = image.resize((new_width, new_height)) + + return scaled_image + + +def get_base64_image(img: str | Image.Image, add_url_prefix: bool = True) -> str: + """ + Convert an image (from a filepath or a PIL.Image object) to a JPEG-base64 string. 
+ """ + if isinstance(img, str): + img = Image.open(img) + elif isinstance(img, Image.Image): + pass + else: + raise ValueError("`img` must be a path to an image or a PIL Image object.") + + buffered = io.BytesIO() + img.save(buffered, format="jpeg") + b64_data = base64.b64encode(buffered.getvalue()).decode("utf-8") + + return f"data:image/jpeg;base64,{b64_data}" if add_url_prefix else b64_data diff --git a/colpali-main/colpali_engine/utils/iter_utils.py b/colpali-main/colpali_engine/utils/iter_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..bef1792045e5c973d07049e66838acc8f8bad7f6 --- /dev/null +++ b/colpali-main/colpali_engine/utils/iter_utils.py @@ -0,0 +1,42 @@ +import sys + + +def islice(iterable, *args): + """ + Yield a slice of an iterable. + >>> islice('ABCDEFG', 2) → A B + >>> islice('ABCDEFG', 2, 4) → C D + >>> islice('ABCDEFG', 2, None) → C D E F G + >>> islice('ABCDEFG', 0, None, 2) → A C E G + """ + s = slice(*args) + start, stop, step = s.start or 0, s.stop or sys.maxsize, s.step or 1 + it = iter(range(start, stop, step)) + try: + nexti = next(it) + except StopIteration: + # Consume *iterable* up to the *start* position. + for i, element in zip(range(start), iterable): + pass + return + try: + for i, element in enumerate(iterable): + if i == nexti: + yield element + nexti = next(it) + except StopIteration: + # Consume to *stop*. + for i, element in zip(range(i + 1, stop), iterable): + pass + + +def batched(iterable, n: int): + """ + Yield batches of n elements from an iterable. + >>> batched('ABCDEFG', 3) → ABC DEF G + """ + if n < 1: + raise ValueError("n must be at least one") + it = iter(iterable) + while batch := tuple(islice(it, n)): + yield batch diff --git a/colpali-main/colpali_engine/utils/pdf_utils.py b/colpali-main/colpali_engine/utils/pdf_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ea07bec45b03afc48aec086a7a783437ececed75 --- /dev/null +++ b/colpali-main/colpali_engine/utils/pdf_utils.py @@ -0,0 +1,87 @@ +import glob +import os +import random +from pathlib import Path + +from pdf2image import convert_from_path +from tqdm import tqdm + +random.seed(42) + + +def convert_pdf_to_images(pdf_file: str, save_folder: str): + """ + Convert each page of a pdf to a jpg image and save them in a folder. + + Args: + - pdf_file (str): path to the pdf file + - save_folder (str): path to the folder where the images will be saved + + """ + images = convert_from_path(pdf_file) + + for i, image in enumerate(images): + if not os.path.exists(save_folder): + os.makedirs(save_folder) + image.save(os.path.join(save_folder, f"page_{i+1}.jpg"), "JPEG") + + +def convert_all_pdfs_to_images(path_to_folder: str, n_samples: int = 0): + """ + Convert all pdfs in a folder and its subfolder to images and save them in a folder. + It will sample n_samples pdf files in each subfolder, allowing to have granularity on the number of pdf files to convert. + + + Args: + - path_to_folder (str): path to the folder containing the pdf files + - n_samples (int): number of pdf files to sample in each subfolder + + directory structure: + - path_to_folder + - subfolder1 + - pdf1 + - pdf2 + - ... + - subfolder2 + - pdf1 + - pdf2 + - ... + - ... 
+ + """ + # take n_samples pdf files in each subfolder : I want to take 10 pdf files from each subfolder + sub_dirs = [d for d in os.listdir(path_to_folder) if os.path.isdir(os.path.join(path_to_folder, d))] + + sampled_files = [] + + for sub_dir in sub_dirs: + pdf_files = glob.glob(os.path.join(path_to_folder, sub_dir, "*.pdf")) + + if (n_samples == 0) or (len(pdf_files) <= n_samples): + print(f"Taking all pdf files in {sub_dir}") + sampled_files.extend(pdf_files) + + else: + print(f"Taking {n_samples} pdf files in {sub_dir}") + sampled_files.extend(random.sample(pdf_files, n_samples)) + + pdf_files = [str(file) for file in sampled_files] + + # Create an empty text file that will contain the file paths of the corrupted pdf files + dirpath_corrupted = Path(path_to_folder) / "corrupted_pdf_files.txt" + dirpath_corrupted.parent.mkdir(parents=True, exist_ok=True) + + with dirpath_corrupted.open("w") as f: + with tqdm(total=len(pdf_files)) as pbar: + for pdf_file in pdf_files: + pbar.set_description(f"Processing {pdf_file}") + save_folder = os.path.join("pages_extracted", *Path(pdf_file).parts[-2:]) + if not os.path.exists(os.path.join(path_to_folder, save_folder)): + try: + convert_pdf_to_images(pdf_file, os.path.join(path_to_folder, save_folder)) + except Exception as e: + print(f"Error converting {pdf_file}: {e}") + f.write(pdf_file) + f.write("\n") + pbar.update(1) + return diff --git a/colpali-main/colpali_engine/utils/plot_utils.py b/colpali-main/colpali_engine/utils/plot_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9e0c414ddca8cad08ae292f219eac43356008719 --- /dev/null +++ b/colpali-main/colpali_engine/utils/plot_utils.py @@ -0,0 +1,6 @@ +import seaborn as sns + + +def setup_seaborn(): + sns.set_style("white") + sns.set_context("paper", font_scale=2) diff --git a/colpali-main/colpali_engine/utils/torch_utils.py b/colpali-main/colpali_engine/utils/torch_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..45c6513327a5be87cdc4c3af877fa60a0e5d6dc8 --- /dev/null +++ b/colpali-main/colpali_engine/utils/torch_utils.py @@ -0,0 +1,18 @@ +""" +Utility functions for interpretability. +""" + +import torch + + +def get_torch_device() -> str: + """ + Returns the device and dtype to be used for torch tensors. 
+ """ + if torch.cuda.is_available(): + device = "cuda:0" + elif torch.backends.mps.is_available(): # for Apple Silicon + device = "mps" + else: + device = "cpu" + return device diff --git a/colpali-main/colpali_engine/utils/train_colpali_engine_models.py b/colpali-main/colpali_engine/utils/train_colpali_engine_models.py new file mode 100644 index 0000000000000000000000000000000000000000..65555273f80d56f495d1e0356edd6c21d0a63ff8 --- /dev/null +++ b/colpali-main/colpali_engine/utils/train_colpali_engine_models.py @@ -0,0 +1,247 @@ +# HuggingFace trainer +import json +import os +from dataclasses import dataclass +from typing import Callable, Dict, Optional + +import torch +from datasets import concatenate_datasets +from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training +from torch.utils.data import DataLoader +from tqdm import tqdm +from transformers import AutoTokenizer, Idefics2Processor, PreTrainedModel, PreTrainedTokenizer, TrainingArguments + +from colpali_engine.dataset.custom_collator import CustomCollator +from colpali_engine.loss.colbert_loss import BiEncoderLoss, BiPairwiseCELoss, ColbertLoss, ColbertPairwiseCELoss +from colpali_engine.trainer.contrastive_trainer import ContrastiveTrainer +from colpali_engine.trainer.retrieval_evaluator import CustomEvaluator +from colpali_engine.utils.gpu_stats import print_gpu_utilization, print_summary + + +@dataclass +class ColModelTrainingConfig: + model: PreTrainedModel + tr_args: TrainingArguments = None + output_dir: str = None + max_length: int = 256 + run_eval: bool = True + run_train: bool = True + peft_config: Optional[LoraConfig] = None + add_suffix: bool = False + processor: Idefics2Processor = None + tokenizer: PreTrainedTokenizer = None + loss_func: Optional[Callable] = ColbertLoss() + dataset_loading_func: Optional[Callable] = None + eval_dataset_loader: Optional[Dict[str, Callable]] = None + pretrained_peft_model_name_or_path: Optional[str] = None + + def __post_init__(self): + if self.output_dir is None: + sanitized_name = str(self.model.name_or_path).replace("/", "_") + self.output_dir = f"./models/{sanitized_name}" + + if self.tr_args is None: + self.tr_args = TrainingArguments(output_dir=self.output_dir) + elif self.tr_args.output_dir is None: + self.tr_args.output_dir = self.output_dir + + # cast if string + if isinstance(self.tr_args.learning_rate, str): + self.tr_args.learning_rate = float(self.tr_args.learning_rate) + self.tr_args.remove_unused_columns = False + + if self.processor is None and self.tokenizer is None: + print("Using textual model tokenization") + self.tokenizer = AutoTokenizer.from_pretrained(self.model.name_or_path) + + if self.pretrained_peft_model_name_or_path is not None: + self.model.load_adapter(self.pretrained_peft_model_name_or_path) + print(f"Loaded pretrained adapter from {self.pretrained_peft_model_name_or_path}") + + if self.peft_config is not None: + print("Configurating PEFT model") + if self.processor is None: + # Might be deprecated - use the "else" branch + self.model = prepare_model_for_kbit_training(self.model) # use_gradient_checkpointing=True + # self.model.enable_input_require_grads() + self.model = get_peft_model(self.model, self.peft_config) + self.model.print_trainable_parameters() + else: + # Ugly debugging hack + # if self.model.model.config.text_config.vocab_size == 32000: + # print("DEBUG: Resizing token embeddings - This should not happen in a real scenario!") + # self.model.model.text_model.resize_token_embeddings(32003) + # 
self.model.model.vision_model.encoder.layers = self.model.model.vision_model.encoder.layers[0:2] + # self.model.enable_input_require_grads() + if self.pretrained_peft_model_name_or_path is None: + self.model.add_adapter(self.peft_config) + self.model.enable_adapters() + else: + print(f"Adapter already loaded from {self.pretrained_peft_model_name_or_path}. Not overwriting.") + + print_gpu_utilization() + + +class ColModelTraining: + def __init__(self, config: ColModelTrainingConfig) -> None: + self.config = config + self.model = self.config.model + self.dataset = self.config.dataset_loading_func() + self.collator = CustomCollator( + processor=self.config.processor, tokenizer=self.config.tokenizer, max_length=self.config.max_length + ) + self.current_git_hash = os.popen("git rev-parse HEAD").read().strip() + self.retriever_evaluator = CustomEvaluator( + is_multi_vector=( + isinstance(self.config.loss_func, ColbertLoss) + or isinstance(self.config.loss_func, ColbertPairwiseCELoss) + ) + ) + + def train(self) -> None: + + trainer = ContrastiveTrainer( + model=self.model, + train_dataset=self.dataset["train"], + eval_dataset=self.dataset["test"], + args=self.config.tr_args, + data_collator=self.collator, + loss_func=self.config.loss_func, + is_vision_model=self.config.processor is not None, + ) + trainer.args.remove_unused_columns = False + + result = trainer.train() + print_summary(result) + + def eval_dataset(self, test_dataset): + + self.model.eval() + + # # debug + # if len(test_dataset) > 200: + # test_dataset = test_dataset.select(range(0, 100)) + + idx_with_query = [idx for idx, sample in enumerate(test_dataset["query"]) if sample is not None] + idx_without_query = [idx for idx, sample in enumerate(test_dataset["query"]) if sample is None] + + dataloader_with_query = DataLoader( + test_dataset.select(idx_with_query), + batch_size=self.config.tr_args.per_device_eval_batch_size, + shuffle=False, + collate_fn=self.collator, + ) + dataloader_without_query = DataLoader( + test_dataset.select(idx_without_query), + batch_size=self.config.tr_args.per_device_eval_batch_size, + shuffle=False, + collate_fn=self.collator, + ) + + # dataset is ordered so that non-null queries come first + test_dataset = concatenate_datasets( + [test_dataset.select(idx_with_query), test_dataset.select(idx_without_query)] + ) + + relevant_docs = {} + docidx_2_docid = {} + qsidx_2_query = [] + for idx, sample in enumerate(test_dataset): + doc_id = sample["image_filename"] if "image_filename" in sample else str(hash(sample["doc"])) + # query_id = sample["query_id"] if "query_id" in sample else str(hash(sample["query"])) + if sample["query"] is not None: + relevant_docs[str(idx)] = {doc_id: 1} + qsidx_2_query.append(str(idx)) + docidx_2_docid[str(idx)] = doc_id + + qs = [] + ps = [] + + device = self.model.device + with (torch.no_grad()): + for dataloader in [dataloader_with_query, dataloader_without_query]: + for batch in tqdm(dataloader): + if "doc_pixel_values" not in batch: + doc = self.model( + input_ids=batch["doc_input_ids"].to(device), + attention_mask=batch["doc_attention_mask"].to(device), + ) + + else: + if "doc_pixel_attention_mask" in batch: + doc = self.model( + input_ids=batch["doc_input_ids"].to(device), + attention_mask=batch["doc_attention_mask"].to(device), + pixel_values=batch["doc_pixel_values"].to(device), + pixel_attention_mask=batch["doc_pixel_attention_mask"].to(device), + ) + else: + doc = self.model( + input_ids=batch["doc_input_ids"].to(device), + 
attention_mask=batch["doc_attention_mask"].to(device), + pixel_values=batch["doc_pixel_values"].to(device), + ) + + ps.extend(list(torch.unbind(doc.to("cpu")))) + + if "query_input_ids" in batch: + query = self.model( + input_ids=batch["query_input_ids"].to(device), + attention_mask=batch["query_attention_mask"].to(device), + ) + # variable len + qs.extend(list(torch.unbind(query.to("cpu")))) + + print("Embeddings computed, evaluating") + scores = self.retriever_evaluator.evaluate(qs, ps) + # scores is 2d array of shape (n_queries, n_docs) + # turn it into a dict + results = {} + assert scores.shape[0] == len(qsidx_2_query) + for idx, scores_per_query in enumerate(scores): + results[qsidx_2_query[idx]] = { + docidx_2_docid[str(docidx)]: float(score) for docidx, score in enumerate(scores_per_query) + } + + # evaluate + metrics = self.retriever_evaluator.compute_metrics(relevant_docs, results) + print(metrics) + return metrics + + def eval(self) -> None: + + print("Evaluating on validation set") + metrics = self.eval_dataset(self.dataset["test"]) + print(f"Metrics for validation set: {metrics}") + all_metrics = {"validation_set": metrics} + + if self.config.eval_dataset_loader is not None: + for test_name, test_dataset_loading_func in self.config.eval_dataset_loader.items(): + print(f"Evaluating {test_name}") + test_ds = test_dataset_loading_func() + metrics = self.eval_dataset(test_ds) + all_metrics[test_name] = metrics + print(f"Metrics for {test_name}: {metrics}") + + # checkpoint dumps + with open(f"{self.config.output_dir}/results.json", "w") as f: + json.dump(all_metrics, f) + + # save results as json + with open(f"{self.config.output_dir}/results.json", "w") as f: + json.dump(all_metrics, f) + + def save(self, config_file): + # save model + self.model.save_pretrained(self.config.output_dir) + if self.config.tokenizer is not None: + self.config.tokenizer.save_pretrained(self.config.output_dir) + if self.config.processor is not None: + self.config.processor.save_pretrained(self.config.output_dir) # save config + + # copy-paste the yml file with os + os.system(f"cp {config_file} {self.config.output_dir}/training_config.yml") + + # save git hash of the commit at beginning of training + with open(f"{self.config.output_dir}/git_hash.txt", "w") as f: + f.write(self.current_git_hash) diff --git a/colpali-main/colpali_engine/utils/wrapper.py b/colpali-main/colpali_engine/utils/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..fe6202858c271c17eccb5b761550682270749aee --- /dev/null +++ b/colpali-main/colpali_engine/utils/wrapper.py @@ -0,0 +1,83 @@ +import importlib + +from colpali_engine.models.clip_baselines import ColSigLIP, SigLIP +from colpali_engine.models.colbert_architectures import ( + BiBERT, + BiXLMRoBERTa, + ColBERT, + ColCamembert, + ColLlama, + ColXLMRoBERTa, +) +from colpali_engine.models.idefics_colbert_architecture import BiIdefics, ColIdefics +from colpali_engine.models.paligemma_colbert_architecture import ( + BiNewSiglip, + BiPaliLast, + BiPaliMean, + ColNewSiglip, + ColPali, +) + +if importlib.util.find_spec("transformers") is not None: + from transformers import AutoProcessor, AutoTokenizer + from transformers.tokenization_utils import PreTrainedTokenizer + + class AutoProcessorWrapper: + def __new__(cls, *args, **kwargs): + return AutoProcessor.from_pretrained(*args, **kwargs) + + class AutoTokenizerWrapper(PreTrainedTokenizer): + def __new__(cls, *args, **kwargs): + return AutoTokenizer.from_pretrained(*args, **kwargs) + + class 
AutoColModelWrapper: + def __new__(cls, *args, **kwargs): + pretrained_model_name_or_path = None + if args: + pretrained_model_name_or_path = args[0] + elif kwargs: + pretrained_model_name_or_path = kwargs["pretrained_model_name_or_path"] + + training_objective = kwargs.pop("training_objective", "colbertv1") + + if "camembert" in pretrained_model_name_or_path: + return ColCamembert.from_pretrained(*args, **kwargs) + elif "xlm-roberta" in pretrained_model_name_or_path: + if training_objective == "biencoder": + return BiXLMRoBERTa.from_pretrained(*args, **kwargs) + return ColXLMRoBERTa.from_pretrained(*args, **kwargs) + elif ( + "llama" in pretrained_model_name_or_path.lower() or "croissant" in pretrained_model_name_or_path.lower() + ): + return ColLlama.from_pretrained(*args, **kwargs) + elif "idefics2" in pretrained_model_name_or_path: + if training_objective == "biencoder": + return BiIdefics.from_pretrained(*args, **kwargs) + return ColIdefics.from_pretrained(*args, **kwargs) + elif "siglip" in pretrained_model_name_or_path: + if training_objective == "biencoder_mean": + return SigLIP.from_pretrained(*args, **kwargs) + elif training_objective == "colbertv1": + return ColSigLIP.from_pretrained(*args, **kwargs) + else: + raise ValueError(f"Training objective {training_objective} not recognized") + elif "paligemma" in pretrained_model_name_or_path: + if training_objective == "biencoder_mean": + return BiPaliMean.from_pretrained(*args, **kwargs) + elif training_objective == "biencoder_last": + return BiPaliLast.from_pretrained(*args, **kwargs) + elif training_objective == "biencoder_mean_vision": + return BiNewSiglip.from_pretrained(*args, **kwargs) + elif training_objective == "colbertv1_vision": + return ColNewSiglip.from_pretrained(*args, **kwargs) + elif training_objective == "colbertv1": + return ColPali.from_pretrained(*args, **kwargs) + else: + raise ValueError(f"Training objective {training_objective} not recognized") + else: + if training_objective == "biencoder": + return BiBERT.from_pretrained(*args, **kwargs) + return ColBERT.from_pretrained(*args, **kwargs) + +else: + raise ModuleNotFoundError("Transformers must be loaded") diff --git a/colpali-main/demo/README.md b/colpali-main/demo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..037b3f142a19eff4ed1c6eb2493b9281170c9dd8 --- /dev/null +++ b/colpali-main/demo/README.md @@ -0,0 +1,6 @@ +--- +title: cvquest-colpali +app_file: app.py +sdk: gradio +sdk_version: 4.39.0 +--- diff --git a/colpali-main/demo/app.py b/colpali-main/demo/app.py new file mode 100644 index 0000000000000000000000000000000000000000..a64a491875b0e2afe815a0166efdd6b7ecf22594 --- /dev/null +++ b/colpali-main/demo/app.py @@ -0,0 +1,99 @@ +import os +import sys + +import gradio as gr +import torch +from pdf2image import convert_from_path +from PIL import Image +from torch.utils.data import DataLoader +from tqdm import tqdm +from transformers import AutoProcessor + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from colpali_engine.models.paligemma_colbert_architecture import ColPali +from colpali_engine.trainer.retrieval_evaluator import CustomEvaluator +from colpali_engine.utils.colpali_processing_utils import process_images, process_queries + +def search(query: str, ds, images, k): + qs = [] + with torch.no_grad(): + batch_query = process_queries(processor, [query], mock_image) + batch_query = {k: v.to(device) for k, v in batch_query.items()} + embeddings_query = model(**batch_query) + 
qs.extend(list(torch.unbind(embeddings_query.to("cpu")))) + + # run evaluation + retriever_evaluator = CustomEvaluator(is_multi_vector=True) + scores = retriever_evaluator.evaluate(qs, ds) + + top_k_indices = scores.argsort(axis=1)[0][-k:][::-1] + + results = [] + for idx in top_k_indices: + results.append((images[idx], f"Page {idx}")) + + return results + + #best_page = int(scores.argmax(axis=1).item()) + #return f"The most relevant page is {best_page}", images[best_page] + + +def index(file, ds): + """Example script to run inference with ColPali""" + images = [] + for f in file: + images.extend(convert_from_path(f)) + + # run inference - docs + dataloader = DataLoader( + images, + batch_size=4, + shuffle=False, + collate_fn=lambda x: process_images(processor, x), + ) + for batch_doc in tqdm(dataloader): + with torch.no_grad(): + batch_doc = {k: v.to(device) for k, v in batch_doc.items()} + embeddings_doc = model(**batch_doc) + ds.extend(list(torch.unbind(embeddings_doc.to("cpu")))) + return f"Uploaded and converted {len(images)} pages", ds, images + +COLORS = ["#4285f4", "#db4437", "#f4b400", "#0f9d58", "#e48ef1"] +# Load model +model_name = "vidore/colpali" +token = os.environ.get("HF_TOKEN") +model = ColPali.from_pretrained( + "google/paligemma-3b-mix-448", torch_dtype=torch.bfloat16, device_map="cpu", token=token +).eval() +model.load_adapter(model_name) +processor = AutoProcessor.from_pretrained(model_name, token=token) +device = model.device +mock_image = Image.new("RGB", (448, 448), (255, 255, 255)) + +with gr.Blocks() as demo: + gr.Markdown("# ColPali: Efficient Document Retrieval with Vision Language Models 📚🔍") + gr.Markdown("## 1️⃣ Upload PDFs") + file = gr.File(file_types=["pdf"], file_count="multiple") + + gr.Markdown("## 2️⃣ Convert the PDFs and upload") + convert_button = gr.Button("🔄 Convert and upload") + message = gr.Textbox("Files not yet uploaded") + embeds = gr.State(value=[]) + imgs = gr.State(value=[]) + + # Define the actions + convert_button.click(index, inputs=[file, embeds], outputs=[message, embeds, imgs]) + + gr.Markdown("## 3️⃣ Search") + query = gr.Textbox(placeholder="Enter your query here") + search_button = gr.Button("🔍 Search") + message2 = gr.Textbox("Query not yet set") + output_img = gr.Image() + k = gr.Slider(minimum=1, maximum=10, step=1, label="Number of results", value=5) + + search_button.click(search, inputs=[query, embeds, imgs, k], outputs=[message2, output_img]) + + +if __name__ == "__main__": + demo.queue(max_size=10).launch(debug=True) diff --git a/colpali-main/pyproject.toml b/colpali-main/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..66159e55011f2270865022d484424d7bee4766dd --- /dev/null +++ b/colpali-main/pyproject.toml @@ -0,0 +1,105 @@ +[project] +name = 'colpali_engine' +# dynamic = ["version"] +version = '0.0.1' +description = 'This repository centralizes ressources for the ColPali project.' 
+authors = [ + { name = 'Manuel Faysse', email = 'manuel.faysse@illuin.tech' }, + { name = 'Hugues Sibille', email = 'hugues.sibille@illuin.tech' }, + { name = 'Tony Wu', email = 'tony.wu@illuin.tech' }, +] +readme = 'README.md' +requires-python = '>=3.9' +classifiers = [ + 'Intended Audience :: Science/Research', + 'Intended Audience :: Developers', + 'Operating System :: OS Independent', + 'Private :: Do Not Upload', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.10', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + 'Typing :: Typed', +] +dependencies = [ + "torch>=2.2.0", + "transformers>=4.41.1", + "mteb>=1.12.22", + "requests", + "GPUtil", + "peft>=0.11.0, <0.12.0", +] + +[project.optional-dependencies] +dev = [ + "black>=24.4.2", + "coverage>=7.5.2", + "ipykernel>=6.29.4", + "mypy>=1.10.0", + "pytest>=8.2.1", + "ruff>=0.4.5", +] + +train = [ + "accelerate==0.30.1", + "configue==5.0.0", + "datasets==2.19.1", + "typer==0.12.3", + "bitsandbytes", +] + +[project.urls] +homepage = "https://github.com/ManuelFay/colpali" + +[build-system] +requires = ['setuptools', 'setuptools_scm[toml]', 'wheel'] +build-backend = 'setuptools.build_meta' + +[tool.setuptools_scm] +fallback_version = '0.0.0-dev' + +[tool.setuptools] +zip-safe = false +platforms = ['any'] + +[tool.setuptools.packages.find] +include = ['colpali_engine', 'colpali_engine.*'] + +[tool.mypy] +check_untyped_defs = true +disallow_untyped_defs = true +enable_error_code = ['ignore-without-code'] +exclude = ['docs/'] +mypy_path = '$MYPY_CONFIG_FILE_DIR/typings' +no_implicit_optional = true +show_error_codes = true +warn_redundant_casts = true +warn_return_any = true +warn_unused_configs = true +warn_unused_ignores = true +warn_unreachable = true + +[[tool.mypy.overrides]] +module = ['transformers', 'transformers.*', 'torch', 'torch.*'] +ignore_missing_imports = true + +[tool.coverage.run] +include = [] + +[tool.coverage.report] +exclude_lines = [ + 'pragma: no cover', + 'raise NotImplementedError', + 'if __name__ == "__main__":', + 'if TYPE_CHECKING:', + 'def __repr__', +] + +[tool.black] +line-length = 120 + +[tool.ruff] +select = ["E", "F", "W", "I", "N"] +line-length = 120 + +[tool.ruff.per-file-ignores] +'__init__.py' = ["F401"] diff --git a/colpali-main/requirements.txt b/colpali-main/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..49e9854a685c9bd586746ea5d012092f30d35c9c --- /dev/null +++ b/colpali-main/requirements.txt @@ -0,0 +1,7 @@ +-e . 
+ +black +pytest +pdf2image +Pillow +typer \ No newline at end of file diff --git a/colpali-main/scripts/configs/data/debug_data.yaml b/colpali-main/scripts/configs/data/debug_data.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6988d4b20c321ca7d0530ac3211ca4971dfe4f88 --- /dev/null +++ b/colpali-main/scripts/configs/data/debug_data.yaml @@ -0,0 +1,3 @@ +syntheticDocQA_energy: + (): colpali_engine.utils.dataset_transformation.TestSetFactory + dataset_path: vidore/syntheticDocQA_energy_test \ No newline at end of file diff --git a/colpali-main/scripts/configs/data/test_data.yaml b/colpali-main/scripts/configs/data/test_data.yaml new file mode 100644 index 0000000000000000000000000000000000000000..edc14c51687bbeeb7f51e695fc923ce7ccb52d3c --- /dev/null +++ b/colpali-main/scripts/configs/data/test_data.yaml @@ -0,0 +1,31 @@ +# eval_dataset_loader: +syntheticDocQA_energy: + (): colpali_engine.utils.dataset_transformation.TestSetFactory + dataset_path: !path ../../../data_dir/syntheticDocQA_energy_test +syntheticDocQA_healthcare_industry: + (): colpali_engine.utils.dataset_transformation.TestSetFactory + dataset_path: !path ../../../data_dir/syntheticDocQA_healthcare_industry_test +syntheticDocQA_artificial_intelligence_test: + (): colpali_engine.utils.dataset_transformation.TestSetFactory + dataset_path: !path ../../../data_dir/syntheticDocQA_artificial_intelligence_test +syntheticDocQA_government_reports: + (): colpali_engine.utils.dataset_transformation.TestSetFactory + dataset_path: !path ../../../data_dir/syntheticDocQA_government_reports_test +infovqa_subsampled: + (): colpali_engine.utils.dataset_transformation.TestSetFactory + dataset_path: !path ../../../data_dir/infovqa_test_subsampled +docvqa_subsampled: + (): colpali_engine.utils.dataset_transformation.TestSetFactory + dataset_path: !path ../../../data_dir/docvqa_test_subsampled +arxivqa_subsampled: + (): colpali_engine.utils.dataset_transformation.TestSetFactory + dataset_path: !path ../../../data_dir/arxivqa_test_subsampled +tabfquad_subsampled: + (): colpali_engine.utils.dataset_transformation.TestSetFactory + dataset_path: !path ../../../data_dir/tabfquad_test_subsampled +tatdqa: + (): colpali_engine.utils.dataset_transformation.TestSetFactory + dataset_path: !path ../../../data_dir/tatdqa_test +shift_project: + (): colpali_engine.utils.dataset_transformation.TestSetFactory + dataset_path: !path ../../../data_dir/shiftproject_test \ No newline at end of file diff --git a/colpali-main/scripts/configs/idefics/eval_colidefics_model.yaml b/colpali-main/scripts/configs/idefics/eval_colidefics_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1f84ca4576d791cd7add43d423ce206e055116ae --- /dev/null +++ b/colpali-main/scripts/configs/idefics/eval_colidefics_model.yaml @@ -0,0 +1,43 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + processor: + (): colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "HuggingFaceM4/idefics2-8b" + do_image_splitting: false + pretrained_peft_model_name_or_path: !path ../../../models/HuggingFaceM4_idefics2-8b-chatty + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "HuggingFaceM4/idefics2-8b-chatty" + training_objective: "colbertv1" + # attn_implementation: "flash_attention_2" + torch_dtype: !ext torch.bfloat16 + quantization_config: + (): transformers.BitsAndBytesConfig + load_in_4bit: true + bnb_4bit_quant_type: "nf4" + 
bnb_4bit_compute_dtype: "float16" + bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_tabfquad_retrieving + max_length: 256 + run_eval: true + run_train: false + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertLoss + tr_args: + (): transformers.training_args.TrainingArguments + output_dir: null + overwrite_output_dir: true + num_train_epochs: 3 + per_device_train_batch_size: 4 + gradient_accumulation_steps: 8 + per_device_eval_batch_size: 4 + eval_strategy: "steps" + dataloader_num_workers: 8 + # bf16: true + save_steps: 500 + logging_steps: 10 + eval_steps: 50 + warmup_steps: 100 + learning_rate: 5e-5 + save_total_limit: 1 diff --git a/colpali-main/scripts/configs/idefics/train_biidefics_model.yaml b/colpali-main/scripts/configs/idefics/train_biidefics_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ac3bc492d08db46c165988d3aff6205b7ebbce6 --- /dev/null +++ b/colpali-main/scripts/configs/idefics/train_biidefics_model.yaml @@ -0,0 +1,52 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/biidefics2-8b-chatty + processor: + (): colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "HuggingFaceM4/idefics2-8b" + do_image_splitting: false + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "HuggingFaceM4/idefics2-8b-chatty" + training_objective: "biencoder" + # attn_implementation: "flash_attention_2" + torch_dtype: !ext torch.bfloat16 + quantization_config: + (): transformers.BitsAndBytesConfig + load_in_4bit: true + bnb_4bit_quant_type: "nf4" + bnb_4bit_compute_dtype: "float16" + bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_docvqa_dataset + max_length: 256 + run_eval: true + loss_func: + (): colpali_engine.loss.colbert_loss.BiEncoderLoss + tr_args: + (): transformers.training_args.TrainingArguments + output_dir: null + overwrite_output_dir: true + num_train_epochs: 3 + per_device_train_batch_size: 4 + gradient_accumulation_steps: 8 + per_device_eval_batch_size: 4 + eval_strategy: "steps" + dataloader_num_workers: 8 + # bf16: true + save_steps: 500 + logging_steps: 10 + eval_steps: 50 + warmup_steps: 100 + learning_rate: 5e-5 + save_total_limit: 1 + + peft_config: + (): peft.LoraConfig + r: 8 + lora_alpha: 8 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$' diff --git a/colpali-main/scripts/configs/idefics/train_colidefics2_model.yaml b/colpali-main/scripts/configs/idefics/train_colidefics2_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0597093d17b751d433282e94ff1df603586a2a60 --- /dev/null +++ b/colpali-main/scripts/configs/idefics/train_colidefics2_model.yaml @@ -0,0 +1,41 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad/train_colidefics2-60 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/idefics2-8b" + do_image_splitting: false + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/idefics2-8b" + training_objective: "colbertv1" + # attn_implementation: 
"eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertPairwiseCELoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '.*(text_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$' + # target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + diff --git a/colpali-main/scripts/configs/idefics/train_colidefics_model.yaml b/colpali-main/scripts/configs/idefics/train_colidefics_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bfe68cb0e8a54a55ca26922458d3a1caf2ff44b4 --- /dev/null +++ b/colpali-main/scripts/configs/idefics/train_colidefics_model.yaml @@ -0,0 +1,54 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/colidefics2-8b-chatty-long + processor: + (): colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "HuggingFaceM4/idefics2-8b" + do_image_splitting: false + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "HuggingFaceM4/idefics2-8b-chatty" + training_objective: "colbertv1" + # attn_implementation: "flash_attention_2" + torch_dtype: !ext torch.bfloat16 + device_map: "auto" + quantization_config: + (): transformers.BitsAndBytesConfig + load_in_4bit: true + bnb_4bit_quant_type: "nf4" + bnb_4bit_compute_dtype: "bfloat16" + bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_docvqa_dataset + max_length: 380 + run_eval: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertLoss + tr_args: + (): transformers.training_args.TrainingArguments + output_dir: null + overwrite_output_dir: true + num_train_epochs: 3 + per_device_train_batch_size: 4 + gradient_accumulation_steps: 8 + per_device_eval_batch_size: 4 + eval_strategy: "steps" + dataloader_num_workers: 8 + # bf16: true + # gradient_checkpointing: true + save_steps: 500 + logging_steps: 10 + eval_steps: 50 + warmup_steps: 100 + learning_rate: 5e-5 + save_total_limit: 1 + + peft_config: + (): peft.LoraConfig + r: 8 + lora_alpha: 8 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$' diff --git a/colpali-main/scripts/configs/idefics/train_colidefics_model_debug.yaml b/colpali-main/scripts/configs/idefics/train_colidefics_model_debug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..adb532d0efc349eba18661206fd5340fca50f627 --- /dev/null +++ b/colpali-main/scripts/configs/idefics/train_colidefics_model_debug.yaml @@ -0,0 +1,54 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + processor: + () : 
colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "HuggingFaceM4/idefics2-8b" + do_image_splitting: false + max_length: 256 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "dondosss/tiny-random-idefics2" + training_objective: "biencoder" + # attn_implementation: "eager" + torch_dtype: !ext torch.float16 + quantization_config: + (): transformers.BitsAndBytesConfig + load_in_4bit: true + bnb_4bit_quant_type: "nf4" + bnb_4bit_compute_dtype: "float16" + bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_docvqa_dataset + max_length: 256 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.BiEncoderLoss + tr_args: + (): transformers.training_args.TrainingArguments + output_dir: null + overwrite_output_dir: true + num_train_epochs: 3 + per_device_train_batch_size: 1 + gradient_accumulation_steps: 2 + per_device_eval_batch_size: 1 + eval_strategy: "steps" + dataloader_num_workers: 8 + max_steps: 20 + bf16: false + save_steps: 500 + logging_steps: 10 + eval_steps: 10 + warmup_steps: 500 + learning_rate: 5e-5 + save_total_limit: 1 + + peft_config: + (): peft.LoraConfig + r: 8 + lora_alpha: 8 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$' diff --git a/colpali-main/scripts/configs/pali/eval_bipali_model.yaml b/colpali-main/scripts/configs/pali/eval_bipali_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6b92ba71a8bd5c55be0efd7ca7a659cc2e55414f --- /dev/null +++ b/colpali-main/scripts/configs/pali/eval_bipali_model.yaml @@ -0,0 +1,31 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/eval_v0_bipali_mean-3b-mix-448 + pretrained_peft_model_name_or_path: !path ../../../models/train_v0_bipali_mean-3b-mix-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "biencoder_mean" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.BiPairwiseCELoss + tr_args: !import ../tr_args/eval_tr_args.yaml diff --git a/colpali-main/scripts/configs/pali/eval_bisiglip_model.yaml b/colpali-main/scripts/configs/pali/eval_bisiglip_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3a5deb76c7454ee576a9c3327536e37ee5dd5685 --- /dev/null +++ b/colpali-main/scripts/configs/pali/eval_bisiglip_model.yaml @@ -0,0 +1,33 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/eval_v0_bisiglipnew-3b-mix-448 + processor: + () : 
colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + pretrained_peft_model_name_or_path: !path ../../../models/train_v0_bisiglipnew-3b-mix-448 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "biencoder_mean_vision" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + run_train: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.BiPairwiseCELoss + tr_args: !import ../tr_args/eval_tr_args.yaml + diff --git a/colpali-main/scripts/configs/pali/eval_colpali_model.yaml b/colpali-main/scripts/configs/pali/eval_colpali_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7097bf0f4f25aac0ffca07fbf5c5f7ed275e028b --- /dev/null +++ b/colpali-main/scripts/configs/pali/eval_colpali_model.yaml @@ -0,0 +1,33 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/results/evals/eval_colpali-3b-mix-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + pretrained_peft_model_name_or_path: !path ../../../models/results/without_tabfquad/train_colpali-3b-mix-448 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "colbertv1" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + run_train: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertPairwiseCELoss + tr_args: !import ../tr_args/eval_tr_args.yaml + diff --git a/colpali-main/scripts/configs/pali/eval_colsiglip_model.yaml b/colpali-main/scripts/configs/pali/eval_colsiglip_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9f91c05ecb73ba1781f7cbebb0816aa70bf33968 --- /dev/null +++ b/colpali-main/scripts/configs/pali/eval_colsiglip_model.yaml @@ -0,0 +1,33 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/eval_v0_colsiglipnew-3b-mix-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + pretrained_peft_model_name_or_path: !path ../../../models/train_v0_colsiglipnew-3b-mix-448 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "colbertv1" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# 
quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + run_train: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertPairwiseCELoss + tr_args: !import ../tr_args/eval_tr_args.yaml + diff --git a/colpali-main/scripts/configs/pali/train_bipali_all_model.yaml b/colpali-main/scripts/configs/pali/train_bipali_all_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d59853840cbaefc0d65efb47aed8481efd3d6487 --- /dev/null +++ b/colpali-main/scripts/configs/pali/train_bipali_all_model.yaml @@ -0,0 +1,41 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad_no_pairwise/train_bipali_all_mean-3b-mix-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "biencoder_mean" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.BiEncoderLoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model|vision_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(multi_modal_projector\.linear).*$)' + # target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$' + diff --git a/colpali-main/scripts/configs/pali/train_bipali_model.yaml b/colpali-main/scripts/configs/pali/train_bipali_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a3280d56dbe1a00eb4b898e970079f46737231a --- /dev/null +++ b/colpali-main/scripts/configs/pali/train_bipali_model.yaml @@ -0,0 +1,41 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad_no_pairwise/train_bipali_mean-3b-mix-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "biencoder_mean" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext 
colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.BiEncoderLoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$)' + # target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$' + diff --git a/colpali-main/scripts/configs/pali/train_bisiglip_new_model.yaml b/colpali-main/scripts/configs/pali/train_bisiglip_new_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..190b2866aec93d267dd6e5995287de491010012f --- /dev/null +++ b/colpali-main/scripts/configs/pali/train_bisiglip_new_model.yaml @@ -0,0 +1,38 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad_no_pairwise/train_bisiglip_new-3b-mix-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "biencoder_mean_vision" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.BiEncoderLoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$)' diff --git a/colpali-main/scripts/configs/pali/train_colpali_224_model.yaml b/colpali-main/scripts/configs/pali/train_colpali_224_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c1ac7db622d08838b37dfb8e8dce230aaf62ecf1 --- /dev/null +++ b/colpali-main/scripts/configs/pali/train_colpali_224_model.yaml @@ -0,0 +1,42 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad/train_colpali-3b-ft-ocrvqa-224 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-ft-ocrvqa-224" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-ft-ocrvqa-224" + training_objective: "colbertv1" + # attn_implementation: "eager" + # attn_implementation: "flash_attention_2" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + 
dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertPairwiseCELoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + # target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + diff --git a/colpali-main/scripts/configs/pali/train_colpali_448_model.yaml b/colpali-main/scripts/configs/pali/train_colpali_448_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ed901b78d1029b39a69be568c9056c1ab32d3640 --- /dev/null +++ b/colpali-main/scripts/configs/pali/train_colpali_448_model.yaml @@ -0,0 +1,42 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad/train_colpali-3b-ft-ocrvqa-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-ft-ocrvqa-448" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-ft-ocrvqa-448" + training_objective: "colbertv1" + # attn_implementation: "eager" + # attn_implementation: "flash_attention_2" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertPairwiseCELoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + # target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + diff --git a/colpali-main/scripts/configs/pali/train_colpali_896_model.yaml b/colpali-main/scripts/configs/pali/train_colpali_896_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..014a4a59ba3c8e2e48ed832338789ff942357385 --- /dev/null +++ b/colpali-main/scripts/configs/pali/train_colpali_896_model.yaml @@ -0,0 +1,42 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad/train_colpali-3b-ft-ocrvqa-896 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-ft-ocrvqa-896" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-ft-ocrvqa-896" + training_objective: "colbertv1" + # attn_implementation: "eager" + # 
attn_implementation: "flash_attention_2" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertPairwiseCELoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + # target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + diff --git a/colpali-main/scripts/configs/pali/train_colpali_all_model.yaml b/colpali-main/scripts/configs/pali/train_colpali_all_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dc7a81dab3c2e8db597f72e0842148f1e85aa617 --- /dev/null +++ b/colpali-main/scripts/configs/pali/train_colpali_all_model.yaml @@ -0,0 +1,41 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad_no_pairwise/train_colpali_all-3b-mix-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "colbertv1" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertLoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model|vision_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(multi_modal_projector\.linear).*$|.*(custom_text_proj).*$)' + # target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + diff --git a/colpali-main/scripts/configs/pali/train_colpali_docmatix_model.yaml b/colpali-main/scripts/configs/pali/train_colpali_docmatix_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7e2a4e481dca0c630339cdcd3f935db48cbd7d46 --- /dev/null +++ b/colpali-main/scripts/configs/pali/train_colpali_docmatix_model.yaml @@ -0,0 +1,41 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/train_colpali-docmatix-3b-mix-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + 
pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "colbertv1" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set_with_docmatix + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertPairwiseCELoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + # target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + diff --git a/colpali-main/scripts/configs/pali/train_colpali_model.yaml b/colpali-main/scripts/configs/pali/train_colpali_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b7de6678594fcb52d5d52801402cae3d1edda153 --- /dev/null +++ b/colpali-main/scripts/configs/pali/train_colpali_model.yaml @@ -0,0 +1,41 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad/train_colpali-3b-mix-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "colbertv1" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertPairwiseCELoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + # target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + diff --git a/colpali-main/scripts/configs/pali/train_colpali_pt_model.yaml b/colpali-main/scripts/configs/pali/train_colpali_pt_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c3950e215ea1a7e464e27b73ceb687dc521a2cf9 --- /dev/null +++ b/colpali-main/scripts/configs/pali/train_colpali_pt_model.yaml @@ -0,0 +1,41 @@ +config: + (): 
colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad/train_colpali-3b-pt-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-pt-448" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-pt-448" + training_objective: "colbertv1" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertPairwiseCELoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + # target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + diff --git a/colpali-main/scripts/configs/pali/train_colsiglip_new_model.yaml b/colpali-main/scripts/configs/pali/train_colsiglip_new_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d1be827506a5dfb46b8dc5371a85a8f8b0484ca7 --- /dev/null +++ b/colpali-main/scripts/configs/pali/train_colsiglip_new_model.yaml @@ -0,0 +1,38 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad_no_pairwise/train_colsiglip_new-3b-mix-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "colbertv1_vision" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertLoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_(text|image)_proj).*$)' diff --git a/colpali-main/scripts/configs/siglip/eval_bisiglip_model.yaml b/colpali-main/scripts/configs/siglip/eval_bisiglip_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e6f0913ca7f6f2434d959e8dc801fb7f3540ac73 --- /dev/null +++ 
b/colpali-main/scripts/configs/siglip/eval_bisiglip_model.yaml @@ -0,0 +1,40 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/eval_real_siglip_untrained + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: !path ../../../models/siglip-so400m-patch14-384 + max_length: 64 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: !path ../../../models/siglip-so400m-patch14-384 + training_objective: "biencoder_mean" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 64 + run_train: true + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.BiPairwiseCELoss + tr_args: !import ../tr_args/eval_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(text_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$)' diff --git a/colpali-main/scripts/configs/siglip/train_bisiglip_model.yaml b/colpali-main/scripts/configs/siglip/train_bisiglip_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7c0d0405d4f0c4ba186a173ad48ae4c896635525 --- /dev/null +++ b/colpali-main/scripts/configs/siglip/train_bisiglip_model.yaml @@ -0,0 +1,40 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad_no_pairwise/train_real_siglip_text_only + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: !path ../../../models/siglip-so400m-patch14-384 + max_length: 64 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: !path ../../../models/siglip-so400m-patch14-384 + training_objective: "biencoder_mean" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 64 + run_train: true + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.BiEncoderLoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(text_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$)' diff --git a/colpali-main/scripts/configs/siglip/train_colsiglip_model.yaml b/colpali-main/scripts/configs/siglip/train_colsiglip_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..72e63f59d4ae02a2fbe3f88192ea9bcb52f2fc8d --- /dev/null +++ b/colpali-main/scripts/configs/siglip/train_colsiglip_model.yaml @@ -0,0 
+1,42 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad_no_pairwise/train_real_colsiglip_text_only + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: !path ../../../models/siglip-so400m-patch14-384 + max_length: 64 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: !path ../../../models/siglip-so400m-patch14-384 + training_objective: "colbertv1" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 64 + run_train: true + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertLoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + # target_modules: '(.*(text_model|vision_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_(text|image)_proj).*$)' + target_modules: '(.*(text_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_(text|image)_proj).*$)' + diff --git a/colpali-main/scripts/configs/siglip/train_siglip_model_debug.yaml b/colpali-main/scripts/configs/siglip/train_siglip_model_debug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..86e440fc700de4ed86f77f19873e537b5d5464f0 --- /dev/null +++ b/colpali-main/scripts/configs/siglip/train_siglip_model_debug.yaml @@ -0,0 +1,60 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad_no_pairwise/train_real_siglip + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: google/siglip-so400m-patch14-384 + max_length: 64 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: google/siglip-so400m-patch14-384 + training_objective: "biencoder_mean" + # attn_implementation: "eager" + torch_dtype: !ext torch.float16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_docvqa_dataset + eval_dataset_loader: !import ../data/debug_data.yaml + + max_length: 64 + run_train: true + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.BiEncoderLoss + tr_args: + (): transformers.training_args.TrainingArguments + output_dir: null + overwrite_output_dir: true + num_train_epochs: 1 + per_device_train_batch_size: 2 + max_steps: 10 + # 6 x 8 gpus = 48 batch size + # gradient_accumulation_steps: 4 + per_device_eval_batch_size: 2 + eval_strategy: "steps" + # dataloader_num_workers: 8 + # bf16: true + save_steps: 500 + logging_steps: 10 + eval_steps: 50 + warmup_steps: 100 + learning_rate: 5e-5 + save_total_limit: 1 + optim: "paged_adamw_8bit" + + peft_config: + (): peft.LoraConfig + r: 32 + 
lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(text_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_(text|image)_proj).*$)' diff --git a/colpali-main/scripts/configs/text_only/train_bibert_model.yaml b/colpali-main/scripts/configs/text_only/train_bibert_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c44f68b8508d9a6bf7a91f184b1881f469575dda --- /dev/null +++ b/colpali-main/scripts/configs/text_only/train_bibert_model.yaml @@ -0,0 +1,28 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/bixlm-roberta-base + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "FacebookAI/xlm-roberta-base" + training_objective: "biencoder" + add_suffix: true + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_manu_embeddings + max_length: 256 + run_eval: true + loss_func: + (): colpali_engine.loss.colbert_loss.BiEncoderLoss + tr_args: + (): transformers.training_args.TrainingArguments + output_dir: null + overwrite_output_dir: true + num_train_epochs: 3 + per_device_train_batch_size: 64 + gradient_accumulation_steps: 2 + per_device_eval_batch_size: 8 + dataloader_num_workers: 8 + # bf16: true + save_steps: 500 + logging_steps: 50 + warmup_steps: 500 + learning_rate: 5e-5 + save_total_limit: 1 diff --git a/colpali-main/scripts/configs/text_only/train_colbert_model.yaml b/colpali-main/scripts/configs/text_only/train_colbert_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d3237715ad8d89eacd46a2ee1508a4ebfef4e11a --- /dev/null +++ b/colpali-main/scripts/configs/text_only/train_colbert_model.yaml @@ -0,0 +1,27 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "FacebookAI/xlm-roberta-base" + + add_suffix: true + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_manu_embeddings + max_length: 256 + run_eval: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertLoss + tr_args: + (): transformers.training_args.TrainingArguments + output_dir: null + overwrite_output_dir: true + num_train_epochs: 3 + per_device_train_batch_size: 64 + gradient_accumulation_steps: 2 + per_device_eval_batch_size: 8 + dataloader_num_workers: 8 + # bf16: true + save_steps: 500 + logging_steps: 50 + warmup_steps: 500 + learning_rate: 5e-5 + save_total_limit: 1 diff --git a/colpali-main/scripts/configs/text_only/train_colbert_model_debug.yaml b/colpali-main/scripts/configs/text_only/train_colbert_model_debug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ec4c6d8e8ee758a311f582cff1425ec82464289a --- /dev/null +++ b/colpali-main/scripts/configs/text_only/train_colbert_model_debug.yaml @@ -0,0 +1,42 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "bert-base-uncased" + training_objective: "biencoder" # "biencoder" + torch_dtype: !ext torch.float16 + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_manu_embeddings + + loss_func: + (): colpali_engine.loss.colbert_loss.BiEncoderLoss # BiEncoderLoss # ColbertLoss + max_length: 128 + run_eval: true + run_train: true 
+ add_suffix: true + tr_args: + (): transformers.training_args.TrainingArguments + output_dir: null + overwrite_output_dir: true + num_train_epochs: 1 + max_steps: 10 + per_device_train_batch_size: 8 + gradient_accumulation_steps: 2 + per_device_eval_batch_size: 8 + dataloader_num_workers: 8 + bf16: false + save_steps: 50 + eval_steps: 50 + eval_strategy: "steps" + logging_steps: 10 + warmup_steps: 10 + learning_rate: 5e-4 + save_total_limit: 1 + + peft_config: + (): peft.LoraConfig + r: 16 + lora_alpha: 32 + lora_dropout: 0.05 + bias: "none" + task_type: "FEATURE_EXTRACTION" \ No newline at end of file diff --git a/colpali-main/scripts/configs/text_only/train_colllama_model.yaml b/colpali-main/scripts/configs/text_only/train_colllama_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5081cf94dc60bc0599698a46e995a76f7ff50ba --- /dev/null +++ b/colpali-main/scripts/configs/text_only/train_colllama_model.yaml @@ -0,0 +1,53 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/lora_CroissantCool-v0.2 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "croissantllm/CroissantCool-v0.2" + attn_implementation: "flash_attention_2" + quantization_config: + (): transformers.BitsAndBytesConfig + load_in_4bit: true + bnb_4bit_quant_type: "nf4" + bnb_4bit_compute_dtype: "bfloat16" + bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_manu_embeddings + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertLoss + max_length: 256 + run_eval: true + add_suffix: true + tr_args: + (): transformers.training_args.TrainingArguments + output_dir: null + overwrite_output_dir: true + num_train_epochs: 3 + per_device_train_batch_size: 128 + gradient_accumulation_steps: 1 + per_device_eval_batch_size: 32 + dataloader_num_workers: 8 + bf16: true + save_steps: 500 + eval_steps: 50 + eval_strategy: "steps" + logging_steps: 10 + warmup_steps: 100 + learning_rate: 5e-5 + save_total_limit: 1 + + peft_config: + (): peft.LoraConfig + r: 16 + lora_alpha: 32 + lora_dropout: 0.05 + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: + - 'up_proj' + - 'down_proj' + - 'gate_proj' + - 'k_proj' + - 'q_proj' + - 'v_proj' + - 'o_proj' diff --git a/colpali-main/scripts/configs/text_only/train_colllama_model_debug.yaml b/colpali-main/scripts/configs/text_only/train_colllama_model_debug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5e750399b54b23305d8bf84fef79d6421e3d3013 --- /dev/null +++ b/colpali-main/scripts/configs/text_only/train_colllama_model_debug.yaml @@ -0,0 +1,51 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "HuggingFaceM4/tiny-random-LlamaForCausalLM" + # attn_implementation: "eager" + quantization_config: + (): transformers.BitsAndBytesConfig + load_in_4bit: true + bnb_4bit_quant_type: "nf4" + bnb_4bit_compute_dtype: "float16" + bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_manu_embeddings + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertLoss + max_length: 256 + run_eval: true + add_suffix: true + tr_args: + (): transformers.training_args.TrainingArguments + output_dir: null + overwrite_output_dir: true + num_train_epochs: 1 + per_device_train_batch_size: 64 + 
gradient_accumulation_steps: 2 + per_device_eval_batch_size: 8 + dataloader_num_workers: 8 + bf16: false + save_steps: 500 + logging_steps: 10 + warmup_steps: 500 + learning_rate: 5e-5 + save_total_limit: 1 + optim: "paged_adamw_8bit" + + peft_config: + (): peft.LoraConfig + r: 16 + lora_alpha: 32 + lora_dropout: 0.05 + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: + - 'up_proj' + - 'down_proj' + - 'gate_proj' + - 'k_proj' + - 'q_proj' + - 'v_proj' + - 'o_proj' diff --git a/colpali-main/scripts/configs/tr_args/default_tr_args.yaml b/colpali-main/scripts/configs/tr_args/default_tr_args.yaml new file mode 100644 index 0000000000000000000000000000000000000000..03186555b5c266dc57a9ffd68e752489ec281639 --- /dev/null +++ b/colpali-main/scripts/configs/tr_args/default_tr_args.yaml @@ -0,0 +1,18 @@ +(): transformers.training_args.TrainingArguments +output_dir: null +overwrite_output_dir: true +num_train_epochs: 1 +per_device_train_batch_size: 4 +# 6 x 8 gpus = 48 batch size +# gradient_accumulation_steps: 4 +per_device_eval_batch_size: 4 +eval_strategy: "steps" +# dataloader_num_workers: 8 +# bf16: true +save_steps: 500 +logging_steps: 10 +eval_steps: 50 +warmup_steps: 100 +learning_rate: 5e-5 +save_total_limit: 1 +# optim: "paged_adamw_8bit" diff --git a/colpali-main/scripts/configs/tr_args/eval_tr_args.yaml b/colpali-main/scripts/configs/tr_args/eval_tr_args.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e6fb7ea7f7ffaab41c5e2ef03f0a3ea12cb753e0 --- /dev/null +++ b/colpali-main/scripts/configs/tr_args/eval_tr_args.yaml @@ -0,0 +1,19 @@ + (): transformers.training_args.TrainingArguments + output_dir: null + overwrite_output_dir: true + num_train_epochs: 1 + per_device_train_batch_size: 4 + # 6 x 8 gpus = 48 batch size + # gradient_accumulation_steps: 4 + per_device_eval_batch_size: 4 + max_steps: 10 + eval_strategy: "steps" + # dataloader_num_workers: 8 + # bf16: true + save_steps: 500 + logging_steps: 10 + eval_steps: 50 + warmup_steps: 100 + learning_rate: 5e-5 + save_total_limit: 1 + optim: "paged_adamw_8bit" \ No newline at end of file diff --git a/colpali-main/scripts/infer/run_inference_with_python.py b/colpali-main/scripts/infer/run_inference_with_python.py new file mode 100644 index 0000000000000000000000000000000000000000..56384efedf61cf63722c0eee977fd3fa89670340 --- /dev/null +++ b/colpali-main/scripts/infer/run_inference_with_python.py @@ -0,0 +1,68 @@ +import sys +import os + +import torch +import typer +from torch.utils.data import DataLoader +from tqdm import tqdm +from transformers import AutoProcessor +from PIL import Image + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from colpali_engine.models.paligemma_colbert_architecture import ColPali +from colpali_engine.trainer.retrieval_evaluator import CustomEvaluator +from colpali_engine.utils.colpali_processing_utils import process_images, process_queries +from colpali_engine.utils.image_from_page_utils import load_from_dataset + + +def main() -> None: + """Example script to run inference with ColPali""" + + # Load model + model_name = "vidore/colpali" + model = ColPali.from_pretrained("google/paligemma-3b-mix-448", torch_dtype=torch.bfloat16, device_map="cpu").eval() + model.load_adapter(model_name) + processor = AutoProcessor.from_pretrained(model_name) + + # select images -> load_from_pdf(), load_from_image_urls([""]), load_from_dataset() + images = load_from_dataset("vidore/docvqa_test_subsampled") + queries = ["From which university does James 
V. Fiorca come ?", "Who is the japanese prime minister?"] + + # run inference - docs + dataloader = DataLoader( + images, + batch_size=4, + shuffle=False, + collate_fn=lambda x: process_images(processor, x), + ) + ds = [] + for batch_doc in tqdm(dataloader): + with torch.no_grad(): + batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()} + embeddings_doc = model(**batch_doc) + ds.extend(list(torch.unbind(embeddings_doc.to("cpu")))) + + # run inference - queries + dataloader = DataLoader( + queries, + batch_size=4, + shuffle=False, + collate_fn=lambda x: process_queries(processor, x, Image.new("RGB", (448, 448), (255, 255, 255))), + ) + + qs = [] + for batch_query in dataloader: + with torch.no_grad(): + batch_query = {k: v.to(model.device) for k, v in batch_query.items()} + embeddings_query = model(**batch_query) + qs.extend(list(torch.unbind(embeddings_query.to("cpu")))) + + # run evaluation + retriever_evaluator = CustomEvaluator(is_multi_vector=True) + scores = retriever_evaluator.evaluate(qs, ds) + print(scores.argmax(axis=1)) + + +if __name__ == "__main__": + typer.run(main) diff --git a/colpali-main/scripts/train/train_colbert.py b/colpali-main/scripts/train/train_colbert.py new file mode 100644 index 0000000000000000000000000000000000000000..f06c299101bd4bf7a160aafe0a24157deddda8c3 --- /dev/null +++ b/colpali-main/scripts/train/train_colbert.py @@ -0,0 +1,29 @@ +from pathlib import Path +from colpali_engine.utils.train_colpali_engine_models import ColModelTrainingConfig, ColModelTraining +from colpali_engine.utils.gpu_stats import print_gpu_utilization +import typer +import configue + + +def main(config_file: Path) -> None: + print_gpu_utilization() + print("Loading config") + config = configue.load(config_file, sub_path="config") + print("Creating Setup") + if isinstance(config, ColModelTrainingConfig): + app = ColModelTraining(config) + else: + raise ValueError("Config must be of type ColModelTrainingConfig") + + if config.run_train: + print("Training model") + app.train() + app.save(config_file=config_file) + if config.run_eval: + print("Running evaluation") + app.eval() + print("Done!") + + +if __name__ == "__main__": + typer.run(main)
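Note on usage: the training entry point above (scripts/train/train_colbert.py) loads one of the configue YAML files from scripts/configs/ and runs training and/or evaluation according to the config's run_train / run_eval flags. The snippet below is a minimal sketch, not part of the repository diff, that mirrors this entry point from Python. It assumes the package has been installed in editable mode (pip install -e ., as in requirements.txt) and that the base model weights and data_dir datasets referenced by the chosen config are available locally; the config path is one of the files shown in this diff and is used purely as an illustration.

# Minimal sketch mirroring scripts/train/train_colbert.py.
# Assumptions: editable install of colpali_engine, and the model weights / datasets
# referenced by the chosen config are present locally.
from pathlib import Path

import configue

from colpali_engine.utils.train_colpali_engine_models import ColModelTraining, ColModelTrainingConfig

config_file = Path("scripts/configs/pali/train_colpali_model.yaml")  # any config shown in this diff
config = configue.load(config_file, sub_path="config")  # instantiates the (): constructors declared in the YAML
if not isinstance(config, ColModelTrainingConfig):
    raise ValueError("Config must be of type ColModelTrainingConfig")

app = ColModelTraining(config)
if config.run_train:
    app.train()
    app.save(config_file=config_file)
if config.run_eval:
    app.eval()

Equivalently, the same flow can be launched from the command line, since the script wraps main() with typer: python scripts/train/train_colbert.py scripts/configs/pali/train_colpali_model.yaml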