diff --git a/colpali-main/.env.dist b/colpali-main/.env.dist new file mode 100644 index 0000000000000000000000000000000000000000..2e8d4d39fd00b0c9a5e43d7c64cc0820e79a897e --- /dev/null +++ b/colpali-main/.env.dist @@ -0,0 +1,5 @@ +HF_TOKEN= +HF_DATASETS_CACHE= + +VERTEX_PROJECT= +VERTEX_LOCATION= diff --git a/colpali-main/.gitattributes b/colpali-main/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..d08d846a4fdc0b92a8e8b9b615b06f4978da1445 --- /dev/null +++ b/colpali-main/.gitattributes @@ -0,0 +1,31 @@ +*.jsonl filter=lfs diff=lfs merge=lfs -text +*.csv filter=lfs diff=lfs merge=lfs -text +*.ipynb filter=lfs diff=lfs merge=lfs -text + +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bin.* filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text \ No newline at end of file diff --git a/colpali-main/.gitignore b/colpali-main/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d5bb3faf053df45809d5fccb0a3168bc92cb8876 --- /dev/null +++ b/colpali-main/.gitignore @@ -0,0 +1,179 @@ +# Custom +.DS_Store +.env +.litellm_cache/ +data/litellm_cache_captionning/ +.idea +.venv/ +colbert/models/ +logs/ +data/downloaded_datasets/rimes_raw_dataset/ +models/ +!colpali_engine/models +data/ +!*/configs/data/ +data_dir/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/*.png + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/
diff --git a/colpali-main/.python-version b/colpali-main/.python-version
new file mode 100644
index 0000000000000000000000000000000000000000..375f5cabfe6cd1337c375dfa0dbc7fbd3180edb9
--- /dev/null
+++ b/colpali-main/.python-version
@@ -0,0 +1 @@
+3.11.6
diff --git a/colpali-main/LICENSE b/colpali-main/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..c67416730fed7bab154d250168c820fcc27eff97
--- /dev/null
+++ b/colpali-main/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Manuel Faysse, Hugues Sibille, Tony Wu
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/colpali-main/README.md b/colpali-main/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..c26dfa7c309b6b3bb8bf7d9695f2b1532d5bb1d1
--- /dev/null
+++ b/colpali-main/README.md
@@ -0,0 +1,222 @@
+# ColPali: Efficient Document Retrieval with Vision Language Models
+
+
+[[Blog]](https://huggingface.co/blog/manu/colpali)
+[[Paper]](https://arxiv.org/abs/2407.01449)
+[[ColPali Model card]](https://huggingface.co/vidore/colpali)
+[[ViDoRe Benchmark]](https://huggingface.co/vidore)
+
+[[HuggingFace Demo]](https://huggingface.co/spaces/manu/ColPali-demo)
+
+
+## Associated Paper
+
+**ColPali: Efficient Document Retrieval with Vision Language Models**
+Manuel Faysse, Hugues Sibille, Tony Wu, Bilel Omrani, Gautier Viaud, Céline Hudelot, Pierre Colombo
+
+This repository contains the code for training custom ColBERT retriever models.
+Notably, we train ColBERT-style retrievers with LLMs (decoders) as well as vision-language models!
+
+## Installation
+
+### From git
+```bash
+pip install git+https://github.com/illuin-tech/colpali
+```
+
+### From source
+```bash
+git clone https://github.com/illuin-tech/colpali
+cd colpali
+pip install -r requirements.txt
+```
+
+## Usage
+
+Example usage of the model is shown in the `scripts` directory.
+
+```bash
+# hackable example script to adapt
+python scripts/infer/run_inference_with_python.py
+```
+
+
+```python
+import torch
+import typer
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import AutoProcessor
+from PIL import Image
+
+from colpali_engine.models.paligemma_colbert_architecture import ColPali
+from colpali_engine.trainer.retrieval_evaluator import CustomEvaluator
+from colpali_engine.utils.colpali_processing_utils import process_images, process_queries
+from colpali_engine.utils.image_from_page_utils import load_from_dataset
+
+
+def main() -> None:
+    """Example script to run inference with ColPali"""
+    # Load model
+    model_name = "vidore/colpali"
+    model = ColPali.from_pretrained("google/paligemma-3b-mix-448", torch_dtype=torch.bfloat16, device_map="cuda").eval()
+    model.load_adapter(model_name)
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    # select images -> load_from_pdf(), load_from_image_urls([""]), load_from_dataset()
+    images = load_from_dataset("vidore/docvqa_test_subsampled")
+    queries = ["From which university does James V. Fiorca come ?", "Who is the japanese prime minister?"]
+
+    # run inference - docs
+    dataloader = DataLoader(
+        images,
+        batch_size=4,
+        shuffle=False,
+        collate_fn=lambda x: process_images(processor, x),
+    )
+    ds = []
+    for batch_doc in tqdm(dataloader):
+        with torch.no_grad():
+            batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}
+            embeddings_doc = model(**batch_doc)
+        ds.extend(list(torch.unbind(embeddings_doc.to("cpu"))))
+
+    # run inference - queries
+    dataloader = DataLoader(
+        queries,
+        batch_size=4,
+        shuffle=False,
+        collate_fn=lambda x: process_queries(processor, x, Image.new("RGB", (448, 448), (255, 255, 255))),
+    )
+
+    qs = []
+    for batch_query in dataloader:
+        with torch.no_grad():
+            batch_query = {k: v.to(model.device) for k, v in batch_query.items()}
+            embeddings_query = model(**batch_query)
+        qs.extend(list(torch.unbind(embeddings_query.to("cpu"))))
+
+    # run evaluation
+    retriever_evaluator = CustomEvaluator(is_multi_vector=True)
+    scores = retriever_evaluator.evaluate(qs, ds)
+    print(scores.argmax(axis=1))
+
+
+if __name__ == "__main__":
+    typer.run(main)
+```
+
+Details are also given in the model card for the base ColPali model on HuggingFace: [ColPali Model card](https://huggingface.co/vidore/colpali).
+
+## Training
+
+```bash
+USE_LOCAL_DATASET=0 python scripts/train/train_colbert.py scripts/configs/siglip/train_siglip_model_debug.yaml
+```
+
+or
+
+```bash
+accelerate launch scripts/train/train_colbert.py scripts/configs/train_colidefics_model.yaml
+```
+
+### Configurations
+All training arguments can be set through a configuration file.
+The configuration file is a YAML file that contains all the arguments for training.
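+
+The example configuration shown further below uses `()` keys to name the Python class that a given YAML block should be instantiated into (for instance `peft.LoraConfig`). As a rough, illustrative sketch of that convention (this is not the repository's actual config loader, which also resolves custom tags such as `!path`, `!ext` and `!import`, and the helper name below is made up for the example), the idea is roughly:
+
+```python
+# Illustrative sketch of the "()" instantiation convention, not the real config loader.
+import importlib
+
+import yaml  # pyyaml
+
+
+def instantiate(node):
+    """Recursively turn {"()": "pkg.module.Class", ...} mappings into objects."""
+    if isinstance(node, dict):
+        # Instantiate children first, then the current node if it carries a "()" key.
+        node = {key: instantiate(value) for key, value in node.items()}
+        target = node.pop("()", None)
+        if target is not None:
+            module_path, _, attr_name = target.rpartition(".")
+            cls = getattr(importlib.import_module(module_path), attr_name)
+            return cls(**node)
+        return node
+    if isinstance(node, list):
+        return [instantiate(value) for value in node]
+    return node
+
+
+raw = yaml.safe_load(
+    """
+    peft_config:
+      (): peft.LoraConfig
+      r: 32
+      lora_alpha: 32
+    """
+)
+peft_config = instantiate(raw)["peft_config"]  # -> peft.LoraConfig(r=32, lora_alpha=32)
+```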
+ +The construction is as follows: + +```python +@dataclass +class ColModelTrainingConfig: + model: PreTrainedModel + tr_args: TrainingArguments = None + output_dir: str = None + max_length: int = 256 + run_eval: bool = True + run_train: bool = True + peft_config: Optional[LoraConfig] = None + add_suffix: bool = False + processor: Idefics2Processor = None + tokenizer: PreTrainedTokenizer = None + loss_func: Optional[Callable] = ColbertLoss() + dataset_loading_func: Optional[Callable] = None + eval_dataset_loader: Optional[Dict[str, Callable]] = None + pretrained_peft_model_name_or_path: Optional[str] = None +``` +### Example + +An example configuration file is: + +```yaml +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad/train_colpali-3b-mix-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "colbertv1" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertPairwiseCELoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + # target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' +``` + + +#### Local training + +```bash +USE_LOCAL_DATASET=0 python scripts/train/train_colbert.py scripts/configs/siglip/train_siglip_model_debug.yaml +``` + + +#### SLURM + +```bash +sbatch --nodes=1 --cpus-per-task=16 --mem-per-cpu=32GB --time=20:00:00 --gres=gpu:1 -p gpua100 --job-name=colidefics --output=colidefics.out --error=colidefics.err --wrap="accelerate launch scripts/train/train_colbert.py scripts/configs/train_colidefics_model.yaml" + +sbatch --nodes=1 --time=5:00:00 -A cad15443 --gres=gpu:8 --constraint=MI250 --job-name=colpali --wrap="python scripts/train/train_colbert.py scripts/configs/train_colpali_model.yaml" +``` + +## CITATION + +```bibtex +@misc{faysse2024colpaliefficientdocumentretrieval, + title={ColPali: Efficient Document Retrieval with Vision Language Models}, + author={Manuel Faysse and Hugues Sibille and Tony Wu and Bilel Omrani and Gautier Viaud and Céline Hudelot and Pierre Colombo}, + year={2024}, + eprint={2407.01449}, + archivePrefix={arXiv}, + primaryClass={cs.IR}, + url={https://arxiv.org/abs/2407.01449}, +} +``` \ No newline at end of file diff --git a/colpali-main/colpali_engine/__init__.py b/colpali-main/colpali_engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/colpali-main/colpali_engine/__pycache__/__init__.cpython-310.pyc b/colpali-main/colpali_engine/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f2edc71e0a26eeaedc5321d2694a6e4f37e67f44 Binary files /dev/null and b/colpali-main/colpali_engine/__pycache__/__init__.cpython-310.pyc differ diff --git a/colpali-main/colpali_engine/dataset/__init__.py b/colpali-main/colpali_engine/dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/colpali-main/colpali_engine/dataset/custom_collator.py b/colpali-main/colpali_engine/dataset/custom_collator.py new file mode 100644 index 0000000000000000000000000000000000000000..5254f31a9b870f2fdf691b312bb13f58e493bae3 --- /dev/null +++ b/colpali-main/colpali_engine/dataset/custom_collator.py @@ -0,0 +1,244 @@ +from transformers import PreTrainedTokenizer, ProcessorMixin + + +class CustomCollator: + def __init__( + self, + processor: ProcessorMixin = None, + tokenizer: PreTrainedTokenizer = None, + max_length: int = 2048, + add_suffix: bool = False, + ): + self.processor = processor + self.tokenizer = tokenizer + self.image_token_id = None + self.max_length = max_length + self.suffix = "" + if add_suffix: + self.suffix = "\n" * 10 + + if tokenizer is None and processor is None: + raise ValueError("Either processor or tokenizer should be provided.") + + if self.processor is not None: + if self.processor.__class__.__name__ != "SiglipProcessor": + self.image_token_id = self.processor.tokenizer.additional_special_tokens_ids[ + self.processor.tokenizer.additional_special_tokens.index("") + ] + + if self.tokenizer is not None: + raise ValueError("Only one of processor or tokenizer should be provided.") + + if self.tokenizer and self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + + def __call__(self, examples): + if self.processor is None: + return self.forward_text(examples) + if self.processor.__class__.__name__ == "Idefics2Processor": + return self.forward_vision_idefics(examples) + if self.processor.__class__.__name__ == "PaliGemmaProcessor": + return self.forward_vision_pali(examples) + if self.processor.__class__.__name__ == "SiglipProcessor": + return self.forward_vision_siglip(examples) + raise ValueError("Processor not supported") + + def forward_text(self, examples): + texts_doc = [] + texts_query = [] + for example in examples: + text_query = example["query"] + self.suffix + text_doc = example["doc"] + + texts_doc.append(text_doc.strip()) + texts_query.append(text_query.strip()) + + batch_doc = self.tokenizer( + texts_doc, max_length=self.max_length, padding="longest", truncation=True, return_tensors="pt" + ) + batch_query = self.tokenizer( + texts_query, max_length=self.max_length, padding="longest", truncation=True, return_tensors="pt" + ) + + # prefix each key with "doc_" or "query_" to avoid key conflicts + batch_doc = {f"doc_{k}": v for k, v in batch_doc.items()} + batch_query = {f"query_{k}": v for k, v in batch_query.items()} + batch_doc.update(batch_query) + + return batch_doc + + def forward_vision_idefics(self, examples): + texts_doc = [] + texts_query = [] + images = [] + for example in examples: + image = example["image"] + + text_query = None + if example["query"] is not None: + query = example["query"] + messages_query = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"Question: {query}", + }, + ], + }, + ] + text_query = 
self.processor.apply_chat_template(messages_query, add_generation_prompt=False).strip() + + messages_doc = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Describe the image."}, + {"type": "image"}, + ], + }, + ] + + text_doc = self.processor.apply_chat_template(messages_doc, add_generation_prompt=False) + + texts_doc.append(text_doc.strip()) + texts_query.append(text_query) + images.append([image]) + + batch_doc = self.processor( + text=texts_doc, images=images, return_tensors="pt", padding="longest", max_length=self.max_length + ) + + batch_query = None + if all([t is None for t in texts_query]): + print("All queries are None. Returning None for all queries.") + elif any([t is None for t in texts_query]): + raise ValueError("Some queries are None. This collator does not support None queries yet.") + else: + batch_query = self.processor( + text=texts_query, return_tensors="pt", padding="longest", max_length=self.max_length + ) + + # prefix each key with "doc_" or "query_" to avoid key conflicts + batch_doc = {f"doc_{k}": v for k, v in batch_doc.items()} + + if batch_query is not None: + batch_query = {f"query_{k}": v for k, v in batch_query.items()} + batch_doc.update(batch_query) + + return batch_doc + + def forward_vision_pali(self, examples): + texts_doc = [] + texts_query = [] + images = [] + for example in examples: + + if example["image"] is None: + raise ValueError("Image is None - This collator does not support None images yet.") + + image = example["image"].convert("RGB") + images.append(image) + texts_doc.append("Describe the image.") + + if example["query"] is None: + texts_query.append(None) + else: + query = example["query"] + query = f"Question: {query}" + texts_query.append(query) + + batch_doc = self.processor( + text=texts_doc, + images=images, + return_tensors="pt", + padding="longest", + max_length=self.max_length + self.processor.image_seq_length, + ) + + batch_query = None + # check if some but not all queries are None + if all([t is None for t in texts_query]): + print("All queries are None. Returning None for all queries.") + elif any([t is None for t in texts_query]): + raise ValueError("Some queries are None. 
This collator does not support None queries yet.") + else: + batch_query = self.processor( + images=images, # NOTE: the image is not used in batch_query but it is required for calling the processor + text=texts_query, + return_tensors="pt", + padding="longest", + max_length=self.max_length + self.processor.image_seq_length, + ) + del batch_query["pixel_values"] + batch_query["input_ids"] = batch_query["input_ids"][..., self.processor.image_seq_length :] + batch_query["attention_mask"] = batch_query["attention_mask"][..., self.processor.image_seq_length :] + + # prefix each key with "doc_" or "query_" to avoid key conflicts + batch_doc = {f"doc_{k}": v for k, v in batch_doc.items()} + + if batch_query is not None: + batch_query = {f"query_{k}": v for k, v in batch_query.items()} + batch_doc.update(batch_query) + + return batch_doc + + def forward_vision_siglip(self, examples): + texts_doc = [] + texts_query = [] + images = [] + for example in examples: + + if example["image"] is None: + raise ValueError("Image is None - This collator does not support None images yet.") + + image = example["image"].convert("RGB") + images.append(image) + texts_doc.append("Describe the image.") + + if example["query"] is None: + texts_query.append(None) + else: + query = f"Question: {example['query']}" + texts_query.append(query) + + batch_doc = self.processor( + text=texts_doc, + images=images, + return_tensors="pt", + padding="max_length", + truncation=True, + ) + + batch_query = None + # check if some but not all queries are None + if all([t is None for t in texts_query]): + # print("All queries are None.") + pass + elif any([t is None for t in texts_query]): + raise ValueError("Some queries are None. This collator does not support None queries yet.") + else: + batch_query = self.processor( + images=images, + text=texts_query, + return_tensors="pt", + padding="max_length", + max_length=self.max_length, + truncation=True, + ) + del batch_query["pixel_values"] + + # prefix each key with "doc_" or "query_" to avoid key conflicts + batch_doc = {f"doc_{k}": v for k, v in batch_doc.items()} + + if batch_query is not None: + batch_query = {f"query_{k}": v for k, v in batch_query.items()} + batch_doc.update(batch_query) + # add attention mask for queries + batch_doc["query_attention_mask"] = batch_doc["query_input_ids"].ne(0).long() + + # add attention mask for docs + batch_doc["doc_attention_mask"] = batch_doc["doc_input_ids"].ne(0).long() + + return batch_doc diff --git a/colpali-main/colpali_engine/dataset/hf_dataset_names.py b/colpali-main/colpali_engine/dataset/hf_dataset_names.py new file mode 100644 index 0000000000000000000000000000000000000000..27ca88090671effdbbd31dbd359dbbe1cfb63bb4 --- /dev/null +++ b/colpali-main/colpali_engine/dataset/hf_dataset_names.py @@ -0,0 +1,52 @@ +from enum import Enum + + +class TrainDatasets(Enum): + """ + Dataset names for the training datasets used in HuggingFace Datasets. 
+ """ + + government_reports = "vidore/syntheticDocQA_government_reports_train" + healthcare_industry = "vidore/syntheticDocQA_healthcare_industry_train" + energy = "vidore/syntheticDocQA_energy_train" + artificial_intelligence = "vidore/syntheticDocQA_artificial_intelligence_train" + arxivqa = "vidore/arxivqa_train" + docvqa = "vidore/docvqa_train" + infovqa = "vidore/infovqa_train" + tatqa = "vidore/tatqa_train" + + @staticmethod + def get_synthetic_datasets(): + return [ + TrainDatasets.government_reports, + TrainDatasets.healthcare_industry, + TrainDatasets.energy, + TrainDatasets.artificial_intelligence, + ] + + +class TestImagesDirpath(Enum): + """ + Dataset names for the test datasets used in HuggingFace Datasets. + """ + + government_reports = "data/government_reports" + healthcare_industry = "data/healthcare_industry" + energy = "data/energy" + artificial_intelligence = "data/scrapped_pdfs_split/pages_extracted/artificial_intelligence_test" + arxivqa = "data/arxivqa" + docvqa = "data/docvqa" + infovqa = "data/infovqa" + tatqa = "data/tatqa" + + +class CaptionedSyntheticDatasets(Enum): + """ + Dataset names for the captioned synthetic datasets used in HuggingFace Datasets. + """ + + shift = "vidore/baseline_cap_shiftproject_test" + + +class SyntheticDocQATest(Enum): + shift = "vidore/shiftproject_test" diff --git a/colpali-main/colpali_engine/evaluation/__init__.py b/colpali-main/colpali_engine/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a92c4b6306f451b5db4bef85ca3d7e0840b2e431 --- /dev/null +++ b/colpali-main/colpali_engine/evaluation/__init__.py @@ -0,0 +1 @@ +from .eval_manager import EvalManager diff --git a/colpali-main/colpali_engine/evaluation/eval_manager.py b/colpali-main/colpali_engine/evaluation/eval_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..30d1ff67485ad35bb5f5b7ce2311b53b7195e871 --- /dev/null +++ b/colpali-main/colpali_engine/evaluation/eval_manager.py @@ -0,0 +1,178 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any, ClassVar, Dict, Optional + +import pandas as pd + + +class EvalManager: + """ + Stores evaluation results for various datasets and metrics. + + The data is stored in a pandas DataFrame with a MultiIndex for columns. + The first level of the MultiIndex is the dataset name and the second level is the metric name. + + Usage: + >>> evaluator = Evaluator.from_dirpath("data/evaluation_results/") + >>> print(evaluator.data) + + """ + + model_col: ClassVar[str] = "model" + dataset_col: ClassVar[str] = "dataset" + metric_col: ClassVar[str] = "metric" + + def __init__(self, data: Optional[pd.DataFrame] = None): + if data is None: + data = pd.DataFrame() + self._df = data + self._df.index = self._df.index.rename(EvalManager.model_col) + + def __str__(self) -> str: + return self.data.__str__() + + @staticmethod + def from_dict(data: Dict[Any, Any]) -> EvalManager: + """ + Load evaluation results from a dictionary. 
+ + Expected format: + { + "model1": pd.read_json(path1).T.stack(), + "model2": pd.read_json(path2).T.stack(), + } + + """ + df = pd.DataFrame.from_dict(data, orient="index") + return EvalManager(df) + + @staticmethod + def from_json(path: str | Path) -> EvalManager: + datapath = Path(path) + if not datapath.is_file(): + raise FileNotFoundError(f"{path} is not a file") + data = {} + data[datapath.stem] = pd.read_json(datapath).T.stack() # pylint: disable=no-member + return EvalManager.from_dict(data) + + @staticmethod + def from_dir(datadir: str | Path) -> EvalManager: + datadir_ = Path(datadir) + if not datadir_.is_dir(): + raise FileNotFoundError(f"{datadir} is not a directory") + + eval_files = list(datadir_.glob("*.json")) + + data = {} + + for filepath in eval_files: + data[filepath.stem] = pd.read_json(filepath).T.stack() # pylint: disable=no-member + + return EvalManager.from_dict(data) + + @staticmethod + def from_csv(path: str | Path) -> EvalManager: + """ + Load evaluation results from a CSV file. + """ + try: + df = pd.read_csv(path, index_col=0, header=[0, 1]) + return EvalManager(df) + except Exception as e: + print(f"Error loading {path}: {e}") + raise e + + @property + def data(self) -> pd.DataFrame: + """ + Returns the evaluation results as a pandas DataFrame. + """ + return self._df.copy() + + @property + def models(self) -> pd.Index: + """ + Returns the models for which there are evaluation results. + """ + return self.data.index + + @property + def datasets(self) -> pd.Index: + """ + Returns the datasets for which there are evaluation results. + """ + return self.data.columns.get_level_values(0).unique() + + @property + def metrics(self) -> pd.Index: + """ + Returns the metrics for which there are evaluation results. + """ + return self.data.columns.get_level_values(1) + + @staticmethod + def melt(df: pd.DataFrame) -> pd.DataFrame: + """ + Melt a suitable DataFrame (e.g. returned by `get_df_for_dataset` and + `get_df_for_metric`) into a 'long' format. + """ + return df.T.reset_index(names=[EvalManager.dataset_col, EvalManager.metric_col]).melt( + id_vars=[EvalManager.dataset_col, EvalManager.metric_col], + var_name=EvalManager.model_col, + value_name="score", + ) + + @property + def melted(self) -> pd.DataFrame: + """ + Returns the evaluation results as a 'melted' DataFrame. + Useful for plotting with seaborn. + """ + return EvalManager.melt(self.data) + + def get_df_for_model(self, model: str) -> pd.DataFrame: + if model not in self.data.index: + raise ValueError(f"Model {model} not found in the evaluation results") + return self.data.loc[[model], :] # type: ignore + + def get_df_for_dataset(self, dataset: str) -> pd.DataFrame: + if dataset not in self.datasets: + raise ValueError(f"Dataset {dataset} not found in the evaluation results") + return self.data.loc[:, (dataset, slice(None))] # type: ignore + + def get_df_for_metric(self, metric: str) -> pd.DataFrame: + if metric not in self.metrics: + raise ValueError(f"Metric {metric} not found in the evaluation results") + return self.data.loc[:, (slice(None), metric)] # type: ignore + + def sort_by_dataset(self, ascending: bool = True) -> EvalManager: + """ + Sort the evaluation results by dataset name. + """ + df = self.data.T.sort_index(level=0, ascending=ascending).T + return EvalManager(df) + + def sort_by_metric(self, ascending: bool = True) -> EvalManager: + """ + Sort the evaluation results by metric name. 
+ """ + df = self.data.T.sort_index(level=1, ascending=ascending).T + return EvalManager(df) + + def sort_columns(self, ascending: bool = True) -> EvalManager: + """ + Sort the evaluation results by dataset name and then by metric name. + """ + df = self.data.T.sort_index(level=[0, 1], ascending=ascending).T + return EvalManager(df) + + def to_csv(self, path: str | Path): + """ + Save the evaluation results to a CSV file. + + Using `Evaluation.from_csv(path_to_saved_csv)` will load the evaluation results back into memory. + """ + savepath = Path(path) + savepath.parent.mkdir(parents=True, exist_ok=True) + self.data.to_csv(savepath) diff --git a/colpali-main/colpali_engine/interpretability/__init__.py b/colpali-main/colpali_engine/interpretability/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3e442b0606e379ce675d742b3a996db97a367c62 --- /dev/null +++ b/colpali-main/colpali_engine/interpretability/__init__.py @@ -0,0 +1,4 @@ +from .plot_utils import * +from .processor import * +from .torch_utils import * +from .vit_configs import * diff --git a/colpali-main/colpali_engine/interpretability/gen_interpretability_plots.py b/colpali-main/colpali_engine/interpretability/gen_interpretability_plots.py new file mode 100644 index 0000000000000000000000000000000000000000..5f377482f2cd2d01f6cdccdd2b26d91e4b490f34 --- /dev/null +++ b/colpali-main/colpali_engine/interpretability/gen_interpretability_plots.py @@ -0,0 +1,113 @@ +import pprint +from dataclasses import asdict, dataclass +from pathlib import Path +from uuid import uuid4 + +import matplotlib.pyplot as plt +import torch +from einops import rearrange +from PIL import Image +from tqdm import trange + +from colpali_engine.interpretability.plot_utils import plot_patches +from colpali_engine.interpretability.processor import ColPaliProcessor +from colpali_engine.interpretability.torch_utils import normalize_attention_map_per_query_token +from colpali_engine.interpretability.vit_configs import VIT_CONFIG +from colpali_engine.models.paligemma_colbert_architecture import ColPali + +OUTDIR_INTERPRETABILITY = Path("outputs/interpretability") + + +@dataclass +class InterpretabilityInput: + query: str + image: Image.Image + start_idx_token: int + end_idx_token: int + + +def generate_interpretability_plots( + model: ColPali, + processor: ColPaliProcessor, + query: str, + image: Image.Image, + savedir: str | Path | None = None, + add_special_prompt_to_doc: bool = True, +) -> None: + + # Sanity checks + if len(model.active_adapters()) != 1: + raise ValueError("The model must have exactly one active adapter.") + + if model.config.name_or_path not in VIT_CONFIG: + raise ValueError("The model must be referred to in the VIT_CONFIG dictionary.") + vit_config = VIT_CONFIG[model.config.name_or_path] + + # Handle savepath + if not savedir: + savedir = OUTDIR_INTERPRETABILITY / str(uuid4()) + print(f"No savepath provided. 
Results will be saved to: `{savedir}`.") + elif isinstance(savedir, str): + savedir = Path(savedir) + savedir.mkdir(parents=True, exist_ok=True) + + # Resize the image to square + input_image_square = image.resize((vit_config.resolution, vit_config.resolution)) + + # Preprocess the inputs + input_text_processed = processor.process_text(query).to(model.device) + input_image_processed = processor.process_image(image, add_special_prompt=add_special_prompt_to_doc).to( + model.device + ) + + # Forward pass + with torch.no_grad(): + output_text = model.forward(**asdict(input_text_processed)) # (1, n_text_tokens, hidden_dim) + + # NOTE: `output_image`` will have shape: + # (1, n_patch_x * n_patch_y, hidden_dim) if `add_special_prompt_to_doc` is False + # (1, n_patch_x * n_patch_y + n_special_tokens, hidden_dim) if `add_special_prompt_to_doc` is True + with torch.no_grad(): + output_image = model.forward(**asdict(input_image_processed)) + + if add_special_prompt_to_doc: # remove the special tokens + output_image = output_image[ + :, : processor.processor.image_seq_length, : + ] # (1, n_patch_x * n_patch_y, hidden_dim) + + output_image = rearrange( + output_image, "b (h w) c -> b h w c", h=vit_config.n_patch_per_dim, w=vit_config.n_patch_per_dim + ) # (1, n_patch_x, n_patch_y, hidden_dim) + + # Get the unnormalized attention map + attention_map = torch.einsum( + "bnk,bijk->bnij", output_text, output_image + ) # (1, n_text_tokens, n_patch_x, n_patch_y) + attention_map_normalized = normalize_attention_map_per_query_token( + attention_map + ) # (1, n_text_tokens, n_patch_x, n_patch_y) + attention_map_normalized = attention_map_normalized.float() + + # Get text token information + n_tokens = input_text_processed.input_ids.size(1) + text_tokens = processor.tokenizer.tokenize(processor.decode(input_text_processed.input_ids[0])) + print("Text tokens:") + pprint.pprint(text_tokens) + print("\n") + + for token_idx in trange(1, n_tokens - 1, desc="Iterating over tokens..."): # exclude the and the "\n" tokens + fig, axis = plot_patches( + input_image_square, + vit_config.patch_size, + vit_config.resolution, + patch_opacities=attention_map_normalized[0, token_idx, :, :], + style="dark_background", + ) + + fig.suptitle(f"Token #{token_idx}: `{text_tokens[token_idx]}`", color="white", fontsize=14) + savepath = savedir / f"token_{token_idx}.png" + fig.savefig(savepath) + print(f"Saved attention map for token {token_idx} (`{text_tokens[token_idx]}`) to `{savepath}`.\n") + plt.close(fig) + + return diff --git a/colpali-main/colpali_engine/interpretability/plot_utils.py b/colpali-main/colpali_engine/interpretability/plot_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..13b41865a5c786acf1848f433a34948b20962293 --- /dev/null +++ b/colpali-main/colpali_engine/interpretability/plot_utils.py @@ -0,0 +1,131 @@ +from typing import Any, Dict, Optional, Tuple, cast + +import matplotlib.pyplot as plt +import numpy as np +import numpy.typing as npt +import seaborn as sns +import torch +from PIL import Image + +MAX_OPACITY = 255 + + +def plot_patches( + img: Image.Image, + patch_size: int, + image_resolution: int, + patch_opacities: Optional[npt.NDArray | torch.Tensor] = None, + figsize: Tuple[int, int] = (8, 8), + style: Dict[str, Any] | str | None = None, +) -> Tuple[plt.Figure, plt.Axes]: + """ + Plot patches of a square image. + Set `style` to "dark_background" if your image has a light background. 
+ """ + + # Get the number of patches + if image_resolution % patch_size != 0: + raise ValueError("The image resolution must be divisible by the patch size.") + num_patches = image_resolution // patch_size + + # Default style + if style is None: + style = {} + + # Sanity checks + if patch_opacities is not None: + if isinstance(patch_opacities, torch.Tensor): + patch_opacities = cast(npt.NDArray, patch_opacities.cpu().numpy()) + if patch_opacities.shape != (num_patches, num_patches): + raise ValueError("The shape of the patch_opacities tensor is not correct.") + if not np.all((0 <= patch_opacities) & (patch_opacities <= 1)): + raise ValueError("The patch_opacities tensor must have values between 0 and 1.") + + # If the image is not square, raise an error + if img.size[0] != img.size[1]: + raise ValueError("The image must be square.") + + # Get the image as a numpy array + img_array = np.array(img.convert("RGBA")) # (H, W, C) where the last channel is the alpha channel + + # Create a figure + with plt.style.context(style): + fig, axis = plt.subplots(num_patches, num_patches, figsize=figsize) + + # Plot the patches + for i in range(num_patches): + for j in range(num_patches): + patch = img_array[i * patch_size : (i + 1) * patch_size, j * patch_size : (j + 1) * patch_size, :] + # Set the opacity of the patch + if patch_opacities is not None: + patch[:, :, -1] = round(patch_opacities[i, j] * MAX_OPACITY) + axis[i, j].imshow(patch) + axis[i, j].axis("off") + + fig.subplots_adjust(wspace=0.1, hspace=0.1) + + fig.tight_layout() + + return fig, axis + + +def plot_attention_heatmap( + img: Image.Image, + patch_size: int, + image_resolution: int, + attention_map: npt.NDArray | torch.Tensor, + figsize: Tuple[int, int] = (8, 8), + style: Dict[str, Any] | str | None = None, + show_colorbar: bool = False, + show_axes: bool = False, +) -> Tuple[plt.Figure, plt.Axes]: + """ + Plot a heatmap of the attention map over the image. + The image must be square and `attention_map` must be normalized between 0 and 1. 
+ """ + + # Get the number of patches + if image_resolution % patch_size != 0: + raise ValueError("The image resolution must be divisible by the patch size.") + num_patches = image_resolution // patch_size + + # Default style + if style is None: + style = {} + + # Sanity checks + if isinstance(attention_map, torch.Tensor): + attention_map = cast(npt.NDArray, attention_map.cpu().numpy()) + if attention_map.shape != (num_patches, num_patches): + raise ValueError("The shape of the patch_opacities tensor is not correct.") + if not np.all((0 <= attention_map) & (attention_map <= 1)): + raise ValueError("The patch_opacities tensor must have values between 0 and 1.") + + # If the image is not square, raise an error + if img.size[0] != img.size[1]: + raise ValueError("The image must be square.") + + # Get the image as a numpy array + img_array = np.array(img.convert("RGBA")) # (H, W, C) where the last channel is the alpha channel + + # Get the attention map as a numpy array + attention_map_image = Image.fromarray((attention_map * 255).astype("uint8")).resize( + img.size, Image.Resampling.BICUBIC + ) + + # Create a figure + with plt.style.context(style): + fig, ax = plt.subplots(figsize=figsize) + ax.imshow(img_array) + im = ax.imshow( + attention_map_image, + cmap=sns.color_palette("mako", as_cmap=True), + alpha=0.5, + ) + if show_colorbar: + fig.colorbar(im) + if not show_axes: + ax.set_axis_off() + fig.tight_layout() + + return fig, ax diff --git a/colpali-main/colpali_engine/interpretability/processor.py b/colpali-main/colpali_engine/interpretability/processor.py new file mode 100644 index 0000000000000000000000000000000000000000..98ac6d5911eac0083d3f1f30031ac5918588ee20 --- /dev/null +++ b/colpali-main/colpali_engine/interpretability/processor.py @@ -0,0 +1,116 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import List, cast + +import torch +from PIL import Image +from transformers import LlamaTokenizerFast, PaliGemmaProcessor + + +@dataclass +class ColPaliTextInput: + input_ids: torch.Tensor + attention_mask: torch.Tensor + + def to(self, device: torch.device) -> ColPaliTextInput: + return ColPaliTextInput( + input_ids=self.input_ids.to(device), + attention_mask=self.attention_mask.to(device), + ) + + +@dataclass +class ColPaliImageInput: + input_ids: torch.Tensor + pixel_values: torch.Tensor + attention_mask: torch.Tensor + + def to(self, device: str | torch.device) -> ColPaliImageInput: + return ColPaliImageInput( + input_ids=self.input_ids.to(device), + pixel_values=self.pixel_values.to(device), + attention_mask=self.attention_mask.to(device), + ) + + +class ColPaliProcessor: + def __init__(self, processor: PaliGemmaProcessor): + self.processor = processor + self.tokenizer = cast(LlamaTokenizerFast, self.processor.tokenizer) # type: ignore + + @staticmethod + def from_pretrained(model_name: str) -> ColPaliProcessor: + return ColPaliProcessor(processor=cast(PaliGemmaProcessor, PaliGemmaProcessor.from_pretrained(model_name))) + + def process_text( + self, + text: str | List[str], + padding: str = "longest", + return_tensors: str = "pt", + add_special_tokens: bool = True, + ) -> ColPaliTextInput: + """ + Process text inputs for the model. + If `add_special_tokens` is True (default), the text will be prepended with the token and appended with " \n". 
+ """ + if add_special_tokens: + if isinstance(text, str): + text = self.tokenizer.bos_token + text + "\n" + elif isinstance(text, list): + text = [self.tokenizer.bos_token + t + "\n" for t in text] + else: + raise ValueError("text must be a string or a list of strings.") + + batch_output = self.tokenizer( + text, padding=padding, return_tensors=return_tensors, add_special_tokens=add_special_tokens + ) + + return ColPaliTextInput( + input_ids=cast(torch.Tensor, batch_output["input_ids"]), + attention_mask=cast(torch.Tensor, batch_output["attention_mask"]), + ) + + def process_image( + self, + image: Image.Image | List[Image.Image], + padding: str = "longest", + do_convert_rgb: bool = True, + return_tensors: str = "pt", + add_special_prompt: bool = True, + ) -> ColPaliImageInput: + # NOTE: The special prompt was used at training time, + special_prompt = "Describe the image." if add_special_prompt else None + if isinstance(image, Image.Image): + text_input = [special_prompt] + elif isinstance(image, list): + text_input = [special_prompt] * len(image) + else: + raise ValueError("image must be a PIL Image or a list of PIL Images.") + + batch_output = self.processor( + text=text_input, + images=image, + padding=padding, + do_convert_rgb=do_convert_rgb, + return_tensors=return_tensors, + ) + + if add_special_prompt: + return ColPaliImageInput( + input_ids=batch_output["input_ids"], + pixel_values=batch_output["pixel_values"], + attention_mask=batch_output["attention_mask"], + ) + else: + return ColPaliImageInput( + input_ids=batch_output["input_ids"][:, : self.processor.image_seq_length], + pixel_values=batch_output["pixel_values"][:, : self.processor.image_seq_length], + attention_mask=batch_output["attention_mask"][:, : self.processor.image_seq_length], + ) + + def decode(self, *args, **kwargs): + return self.tokenizer.decode(*args, **kwargs) + + def batch_decode(self, *args, **kwargs): + return self.tokenizer.batch_decode(*args, **kwargs) diff --git a/colpali-main/colpali_engine/interpretability/torch_utils.py b/colpali-main/colpali_engine/interpretability/torch_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2960f3e85e4329ad1def921a0b698bc4536d431d --- /dev/null +++ b/colpali-main/colpali_engine/interpretability/torch_utils.py @@ -0,0 +1,60 @@ +import logging + +import torch + +logger = logging.getLogger(__name__) + +EPSILON = 1e-10 + + +def normalize_attention_map_per_query_token(x: torch.Tensor) -> torch.Tensor: + """ + Normalizes the attention map for ColPali for each query token. + The output tensor will have values in the range [0, 1] and the + same shape as the input tensor. + + Args: + x: The attention map tensor of shape (batch_size, n_text_tokens, n_patch_x, n_patch_y). + """ + if x.ndim != 4: + raise ValueError("The input tensor must have 4 dimensions.") + + # Compute the minimum values along the last two dimensions (n_patch_x, n_patch_y) + min_vals = x.min(dim=-1, keepdim=True)[0].min(dim=-2, keepdim=True)[0] + + # Compute the maximum values along the last two dimensions (n_patch_x, n_patch_y) + max_vals = x.max(dim=-1, keepdim=True)[0].max(dim=-2, keepdim=True)[0] + + # Normalize the tensor + x_normalized = (x - min_vals) / (max_vals - min_vals + EPSILON) # Adding a small epsilon to avoid division by zero + + return x_normalized + + +def normalize_attention_map_per_query(x: torch.Tensor) -> torch.Tensor: + """ + Normalizes the attention map for ColPali for each query token. 
+ The output tensor will have values in the range [0, 1] and the + same shape as the input tensor. + + Args: + x: The attention map tensor of shape (batch_size, n_text_tokens, n_patch_x, n_patch_y). + """ + # Log warning + logger.warning( + "This function should not be used for ColPali because it doesn't make sense to normalize the attention map across the text tokens." + ) + + if x.ndim != 4: + raise ValueError("The input tensor must have 4 dimensions.") + + # Compute the minimum values along the last three dimensions (n_text_tokens, n_patch_x, n_patch_y) + min_vals = x.min(dim=-1, keepdim=True)[0].min(dim=-2, keepdim=True)[0].min(dim=-3, keepdim=True)[0] + + # Compute the maximum values along the last three dimensions (n_text_tokens, n_patch_x, n_patch_y) + max_vals = x.max(dim=-1, keepdim=True)[0].max(dim=-2, keepdim=True)[0].max(dim=-3, keepdim=True)[0] + + # Normalize the tensor + x_normalized = (x - min_vals) / (max_vals - min_vals + EPSILON) # Adding a small epsilon to avoid division by zero + + return x_normalized diff --git a/colpali-main/colpali_engine/interpretability/vit_configs.py b/colpali-main/colpali_engine/interpretability/vit_configs.py new file mode 100644 index 0000000000000000000000000000000000000000..66b4bceb2d030ef206ed36697a27b789f062b96a --- /dev/null +++ b/colpali-main/colpali_engine/interpretability/vit_configs.py @@ -0,0 +1,23 @@ +from dataclasses import dataclass +from typing import Dict + + +@dataclass +class ViTConfig: + patch_size: int + resolution: int + + @property + def n_patch_per_dim(self) -> int: + if self.resolution % self.patch_size != 0: + raise ValueError(f"Resolution {self.resolution} is not divisible by patch size {self.patch_size}") + return self.resolution // self.patch_size + + +VIT_CONFIG: Dict[str, ViTConfig] = { + "google/siglip-so400m-patch14-384": ViTConfig(patch_size=14, resolution=384), + "timm/ViT-SO400M-14-SigLIP-384": ViTConfig(patch_size=14, resolution=384), + "google/paligemma-3b-mix-448": ViTConfig( + patch_size=14, resolution=448 + ), # based on "timm/ViT-SO400M-14-SigLIP-384" with increased resolution +} diff --git a/colpali-main/colpali_engine/loss/__init__.py b/colpali-main/colpali_engine/loss/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9613afd7a8f9ea0b6294e4e0df23e6951dcd073a --- /dev/null +++ b/colpali-main/colpali_engine/loss/__init__.py @@ -0,0 +1 @@ +from .colbert_loss import ColbertLoss diff --git a/colpali-main/colpali_engine/loss/colbert_loss.py b/colpali-main/colpali_engine/loss/colbert_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..c52a216b31f3aade638b2adb08f76033f5d7ddb6 --- /dev/null +++ b/colpali-main/colpali_engine/loss/colbert_loss.py @@ -0,0 +1,122 @@ +import torch +import torch.nn.functional as F +from torch.nn import CrossEntropyLoss + + +class BiEncoderLoss(torch.nn.Module): + def __init__(self): + super().__init__() + self.ce_loss = CrossEntropyLoss() + # self.pooling_strategy = pooling_strategy + + def forward(self, query_embeddings, doc_embeddings): + """ + query_embeddings: (batch_size, dim) + doc_embeddings: (batch_size, dim) + """ + + scores = torch.einsum("bd,cd->bc", query_embeddings, doc_embeddings) + + loss_rowwise = self.ce_loss(scores, torch.arange(scores.shape[0], device=scores.device)) + # loss_columnwise = self.ce_loss(scores.T, torch.arange(scores.shape[1], device=scores.device)) + # loss = (loss_rowwise + loss_columnwise) / 2 + return loss_rowwise + + +class ColbertLoss(torch.nn.Module): + def __init__(self): + 
super().__init__() + self.ce_loss = CrossEntropyLoss() + + def forward(self, query_embeddings, doc_embeddings): + """ + query_embeddings: (batch_size, num_query_tokens, dim) + doc_embeddings: (batch_size, num_doc_tokens, dim) + """ + + scores = torch.einsum("bnd,csd->bcns", query_embeddings, doc_embeddings).max(dim=3)[0].sum(dim=2) + + # scores = torch.zeros((query_embeddings.shape[0], doc_embeddings.shape[0]), device=query_embeddings.device) + # for i in range(query_embeddings.shape[0]): + # for j in range(doc_embeddings.shape[0]): + # # step 1 - dot product --> (s1,s2) + # q2d_scores = torch.matmul(query_embeddings[i], doc_embeddings[j].T) + # # step 2 -> max on doc --> (s1) + # q_scores = torch.max(q2d_scores, dim=1)[0] + # # step 3 --> sum the max score --> (1) + # sum_q_score = torch.sum(q_scores) + # # step 4 --> assert is scalar + # scores[i, j] = sum_q_score + + # assert (scores_einsum - scores < 0.0001).all().item() + + loss_rowwise = self.ce_loss(scores, torch.arange(scores.shape[0], device=scores.device)) + # TODO: comparing between queries might not make sense since it's a sum over the length of the query + # loss_columnwise = self.ce_loss(scores.T, torch.arange(scores.shape[1], device=scores.device)) + # loss = (loss_rowwise + loss_columnwise) / 2 + return loss_rowwise + + +class ColbertPairwiseCELoss(torch.nn.Module): + def __init__(self): + super().__init__() + self.ce_loss = CrossEntropyLoss() + + def forward(self, query_embeddings, doc_embeddings): + """ + query_embeddings: (batch_size, num_query_tokens, dim) + doc_embeddings: (batch_size, num_doc_tokens, dim) + + Positive scores are the diagonal of the scores matrix. + """ + + # Compute the ColBERT scores + scores = ( + torch.einsum("bnd,csd->bcns", query_embeddings, doc_embeddings).max(dim=3)[0].sum(dim=2) + ) # (batch_size, batch_size) + + # Positive scores are the diagonal of the scores matrix. + pos_scores = scores.diagonal() # (batch_size,) + + # Negative score for a given query is the maximum of the scores against all all other pages. + # NOTE: We exclude the diagonal by setting it to a very low value: since we know the maximum score is 1, + # we can subtract 1 from the diagonal to exclude it from the maximum operation. + neg_scores = scores - torch.eye(scores.shape[0], device=scores.device) * 1e6 # (batch_size, batch_size) + neg_scores = neg_scores.max(dim=1)[0] # (batch_size,) + + # Compute the loss + # The loss is computed as the negative log of the softmax of the positive scores + # relative to the negative scores. + # This can be simplified to log-sum-exp of negative scores minus the positive score + # for numerical stability. + # torch.vstack((pos_scores, neg_scores)).T.softmax(1)[:, 0].log()*(-1) + loss = F.softplus(neg_scores - pos_scores).mean() + + return loss + + +class BiPairwiseCELoss(torch.nn.Module): + def __init__(self): + super().__init__() + self.ce_loss = CrossEntropyLoss() + + def forward(self, query_embeddings, doc_embeddings): + """ + query_embeddings: (batch_size, dim) + doc_embeddings: (batch_size, dim) + """ + + scores = torch.einsum("bd,cd->bc", query_embeddings, doc_embeddings) + + pos_scores = scores.diagonal() + neg_scores = scores - torch.eye(scores.shape[0], device=scores.device) * 1e6 + neg_scores = neg_scores.max(dim=1)[0] + + # Compute the loss + # The loss is computed as the negative log of the softmax of the positive scores + # relative to the negative scores. + # This can be simplified to log-sum-exp of negative scores minus the positive score + # for numerical stability. 
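+        # softplus(neg_scores - pos_scores) is equivalent to -log(sigmoid(pos_scores - neg_scores)), i.e. a pairwise logistic loss on the score margin.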
+ loss = F.softplus(neg_scores - pos_scores).mean() + + return loss diff --git a/colpali-main/colpali_engine/models/__init__.py b/colpali-main/colpali_engine/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/colpali-main/colpali_engine/models/__pycache__/__init__.cpython-310.pyc b/colpali-main/colpali_engine/models/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b2575064035fd713786886734ecc4f4a6bbfb0f0 Binary files /dev/null and b/colpali-main/colpali_engine/models/__pycache__/__init__.cpython-310.pyc differ diff --git a/colpali-main/colpali_engine/models/__pycache__/paligemma_colbert_architecture.cpython-310.pyc b/colpali-main/colpali_engine/models/__pycache__/paligemma_colbert_architecture.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c120dc43b7851c9f6b59991d280525995f1db46 Binary files /dev/null and b/colpali-main/colpali_engine/models/__pycache__/paligemma_colbert_architecture.cpython-310.pyc differ diff --git a/colpali-main/colpali_engine/models/clip_baselines.py b/colpali-main/colpali_engine/models/clip_baselines.py new file mode 100644 index 0000000000000000000000000000000000000000..0f256b074a22cc3fe2199b85f3f6f00a1442d6d7 --- /dev/null +++ b/colpali-main/colpali_engine/models/clip_baselines.py @@ -0,0 +1,144 @@ +import os +from typing import Optional + +import torch +from transformers import SiglipModel + + +class SigLIP(SiglipModel): + def forward(self, *args, **kwargs): + """ + Forward pass through Llama and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. + + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + return self.forward_branch(*args, **kwargs) + + def forward_branch( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ): + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is not None: + # Use SigLIP model's config for some fields (if specified) instead of those of vision & text components. 
+ + outputs = self.vision_model( + pixel_values=pixel_values.to(dtype=self.dtype), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + else: + outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + embeds = outputs[1] + + # normalized features + embeds = embeds / embeds.norm(p=2, dim=-1, keepdim=True) + return embeds + + +class ColSigLIP(SiglipModel): + def __init__(self, config): + super(ColSigLIP, self).__init__(config=config) + self.dim = 128 + self.custom_vision_proj = torch.nn.Linear(self.config.vision_config.hidden_size, self.dim) + self.custom_text_proj = torch.nn.Linear(self.config.text_config.hidden_size, self.dim) + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Llama and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. + + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + return self.forward_branch(*args, **kwargs) + + def forward_branch( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ): + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is not None: + # Use SigLIP model's config for some fields (if specified) instead of those of vision & text components. 
+ + outputs = self.vision_model( + pixel_values=pixel_values.to(dtype=self.dtype), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + last_hidden_states = outputs.last_hidden_state + + proj = self.custom_vision_proj(last_hidden_states) + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + + else: + outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_states = outputs.last_hidden_state + + proj = self.custom_text_proj(last_hidden_states) + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + proj = proj * attention_mask.unsqueeze(-1) + + # normalized features + return proj diff --git a/colpali-main/colpali_engine/models/colbert_architectures.py b/colpali-main/colpali_engine/models/colbert_architectures.py new file mode 100644 index 0000000000000000000000000000000000000000..97b0eedcc5dcbeff4f0b7b8dd9a23ba8841ced1f --- /dev/null +++ b/colpali-main/colpali_engine/models/colbert_architectures.py @@ -0,0 +1,177 @@ +from torch import nn +from transformers import ( + BertModel, + BertPreTrainedModel, + CamembertModel, + CamembertPreTrainedModel, + LlamaModel, + LlamaPreTrainedModel, + XLMRobertaModel, + XLMRobertaPreTrainedModel, +) + + +class ColCamembert(CamembertPreTrainedModel): + def __init__(self, config): + super(ColCamembert, self).__init__(config=config) + self.roberta: CamembertPreTrainedModel = CamembertModel(config) + self.dim = 128 + self.linear = nn.Linear(self.roberta.config.hidden_size, self.dim) + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Camenbert and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. + + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + outputs = self.roberta(*args, **kwargs) + last_hidden_states = outputs[0] # (batch_size, sequence_length, hidden_size) + proj = self.linear(last_hidden_states) + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + proj = proj * kwargs["attention_mask"].unsqueeze(-1) + return proj + + +class ColXLMRoBERTa(XLMRobertaPreTrainedModel): + def __init__(self, config): + super(ColXLMRoBERTa, self).__init__(config=config) + self.roberta: XLMRobertaPreTrainedModel = XLMRobertaModel(config) + self.dim = 128 + self.linear = nn.Linear(self.roberta.config.hidden_size, self.dim) + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Roberta and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. 
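+
+        Example (sketch; the checkpoint name is only illustrative):
+            >>> from transformers import AutoTokenizer
+            >>> tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
+            >>> model = ColXLMRoBERTa.from_pretrained("xlm-roberta-base")
+            >>> batch = tokenizer(["What is ColBERT?"], return_tensors="pt")
+            >>> embeddings = model(**batch)  # (1, num_tokens, 128)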
+ + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + outputs = self.roberta(*args, **kwargs) + last_hidden_states = outputs[0] # (batch_size, sequence_length, hidden_size) + proj = self.linear(last_hidden_states) + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + proj = proj * kwargs["attention_mask"].unsqueeze(-1) + return proj + + +class BiXLMRoBERTa(XLMRobertaPreTrainedModel): + def __init__(self, config): + super(BiXLMRoBERTa, self).__init__(config=config) + self.roberta: XLMRobertaPreTrainedModel = XLMRobertaModel(config) + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Roberta and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. + + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + outputs = self.roberta(*args, **kwargs) + last_hidden_states = outputs[0] # (batch_size, sequence_length, hidden_size) + # pooling - mean tokens that have attention mask == 1 + proj = last_hidden_states * kwargs["attention_mask"].unsqueeze(-1) + proj = proj.sum(dim=1) / kwargs["attention_mask"].sum(dim=1, keepdim=True) + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + return proj + + +class ColBERT(BertPreTrainedModel): + def __init__(self, config): + super(ColBERT, self).__init__(config=config) + self.bert: BertModel = BertModel(config) + self.dim = 128 + self.linear = nn.Linear(self.bert.config.hidden_size, self.dim) + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through BERT and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. + + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + outputs = self.bert(*args, **kwargs) + last_hidden_states = outputs[0] # (batch_size, sequence_length, hidden_size) + proj = self.linear(last_hidden_states) + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + proj = proj * kwargs["attention_mask"].unsqueeze(-1) + return proj + + +class BiBERT(BertPreTrainedModel): + def __init__(self, config): + super(BiBERT, self).__init__(config=config) + self.bert: BertModel = BertModel(config) + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through BERT and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. 
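+
+        Note: unlike the Col* variants, this bi-encoder mean-pools the token
+        embeddings over the attention mask and L2-normalises the result, so it
+        yields one vector of shape (batch_size, hidden_size) per sequence.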
+ + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + outputs = self.bert(*args, **kwargs) + last_hidden_states = outputs[0] # (batch_size, sequence_length, hidden_size) + # pooling - mean tokens that have attention mask == 1 + proj = last_hidden_states * kwargs["attention_mask"].unsqueeze(-1) + proj = proj.sum(dim=1) / kwargs["attention_mask"].sum(dim=1, keepdim=True) + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + return proj + + +class ColLlama(LlamaPreTrainedModel): + def __init__(self, config): + super(ColLlama, self).__init__(config=config) + self.model: LlamaModel = LlamaModel(config) + self.dim = 128 + self.linear = nn.Linear(self.model.config.hidden_size, self.dim) + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Llama and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. + + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + outputs = self.model(*args, **kwargs) + last_hidden_states = outputs[0] # (batch_size, sequence_length, hidden_size) + proj = self.linear(last_hidden_states) + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + proj = proj * kwargs["attention_mask"].unsqueeze(-1) + return proj diff --git a/colpali-main/colpali_engine/models/idefics_colbert_architecture.py b/colpali-main/colpali_engine/models/idefics_colbert_architecture.py new file mode 100644 index 0000000000000000000000000000000000000000..0fe67163d572f6c91d2840b0a43ccb9d29f4bdd3 --- /dev/null +++ b/colpali-main/colpali_engine/models/idefics_colbert_architecture.py @@ -0,0 +1,57 @@ +from torch import nn +from transformers import Idefics2Model, Idefics2PreTrainedModel + + +class BiIdefics(Idefics2PreTrainedModel): + def __init__(self, config): + super(BiIdefics, self).__init__(config=config) + self.model: Idefics2Model = Idefics2Model(config) + self.pooling_strategy = "last" + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Llama and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. + + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + outputs = self.model(*args, **kwargs) + last_hidden_states = outputs[0] # (batch_size, sequence_length, hidden_size) + # pooling - last token + proj = last_hidden_states[:, -1, :] + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + return proj + + +class ColIdefics(Idefics2PreTrainedModel): + def __init__(self, config): + super(ColIdefics, self).__init__(config=config) + self.model: Idefics2Model = Idefics2Model(config) + self.dim = 128 + self.linear = nn.Linear(self.model.config.text_config.hidden_size, self.dim) + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Llama and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. 
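+
+        Note: inputs are expected to be built with the Idefics2 processor chat
+        template (see colpali_engine.utils.colidefics_processing_utils); the
+        output keeps one 128-dim embedding per token, zeroed where the
+        attention mask is 0.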
+ + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + outputs = self.model(*args, **kwargs) + last_hidden_states = outputs[0] # (batch_size, sequence_length, hidden_size) + proj = self.linear(last_hidden_states) + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + proj = proj * kwargs["attention_mask"].unsqueeze(-1) + return proj diff --git a/colpali-main/colpali_engine/models/paligemma_colbert_architecture.py b/colpali-main/colpali_engine/models/paligemma_colbert_architecture.py new file mode 100644 index 0000000000000000000000000000000000000000..b167c6a437cfba550eb304775c7acd8d46c1e579 --- /dev/null +++ b/colpali-main/colpali_engine/models/paligemma_colbert_architecture.py @@ -0,0 +1,191 @@ +import torch +from torch import nn +from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration, PaliGemmaPreTrainedModel + + +class BiPaliLast(PaliGemmaPreTrainedModel): + def __init__(self, config): + super(BiPaliLast, self).__init__(config=config) + self.model: PaliGemmaForConditionalGeneration = PaliGemmaForConditionalGeneration(config) + self.pooling_strategy = "last" + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Llama and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. + + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + outputs = self.model(*args, output_hidden_states=True, **kwargs) + last_hidden_states = outputs.hidden_states[-1] # (batch_size, sequence_length, hidden_size) + # pooling - last token + proj = last_hidden_states[:, -1, :] + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + return proj + + +class BiPaliMean(PaliGemmaPreTrainedModel): + def __init__(self, config): + super(BiPaliMean, self).__init__(config=config) + self.model: PaliGemmaForConditionalGeneration = PaliGemmaForConditionalGeneration(config) + self.pooling_strategy = "mean" + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Llama and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. + + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + outputs = self.model(*args, output_hidden_states=True, **kwargs) + last_hidden_states = outputs.hidden_states[-1] # (batch_size, sequence_length, hidden_size) + # pooling -mean on attention mask==1 + proj = torch.sum(last_hidden_states * kwargs["attention_mask"].unsqueeze(-1), dim=1) / torch.sum( + kwargs["attention_mask"], dim=1, keepdim=True + ) + proj = proj / proj.norm(dim=-1, keepdim=True) + return proj + + +class ColPali(PaliGemmaPreTrainedModel): + def __init__(self, config): + super(ColPali, self).__init__(config=config) + self.model: PaliGemmaForConditionalGeneration = PaliGemmaForConditionalGeneration(config) + self.dim = 128 + self.custom_text_proj = nn.Linear(self.model.config.text_config.hidden_size, self.dim) + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Llama and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. 
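+
+        Example (sketch mirroring demo/app.py; checkpoint names and the input
+        image are assumptions):
+            >>> import torch
+            >>> from transformers import AutoProcessor
+            >>> from colpali_engine.utils.colpali_processing_utils import process_images
+            >>> model = ColPali.from_pretrained(
+            ...     "google/paligemma-3b-mix-448", torch_dtype=torch.bfloat16
+            ... ).eval()
+            >>> model.load_adapter("vidore/colpali")
+            >>> processor = AutoProcessor.from_pretrained("vidore/colpali")
+            >>> batch = process_images(processor, [page_image])  # page_image: a PIL.Image of a document page
+            >>> embeddings = model(**batch)  # (1, num_doc_tokens, 128)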
+ + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + outputs = self.model(*args, output_hidden_states=True, **kwargs) + last_hidden_states = outputs.hidden_states[-1] # (batch_size, sequence_length, hidden_size) + proj = self.custom_text_proj(last_hidden_states) + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + proj = proj * kwargs["attention_mask"].unsqueeze(-1) + return proj + + +class ColNewSiglip(PaliGemmaPreTrainedModel): + def __init__(self, config): + super(ColNewSiglip, self).__init__(config=config) + self.model: PaliGemmaForConditionalGeneration = PaliGemmaForConditionalGeneration(config) + self.dim = 128 + self.custom_image_proj = nn.Linear(self.model.config.vision_config.projection_dim, self.dim) + self.custom_text_proj = nn.Linear(self.model.config.text_config.hidden_size, self.dim) + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Llama and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. + + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + # outputs = self.model(*args, output_hidden_states=True, **kwargs) + if "pixel_values" in kwargs: + image_features = self.vision_model_output(*args, **kwargs) + # print(f"Doc: {image_features.shape}") + proj = self.custom_image_proj(image_features) + # print(f"Doc proj: {proj.shape}") + proj = proj / proj.norm(dim=-1, keepdim=True) + else: + outputs = self.model(*args, output_hidden_states=True, **kwargs) + last_hidden_states = outputs.hidden_states[-1] # (batch_size, sequence_length, hidden_size) + # print(f"Query: {last_hidden_states.shape}") + proj = self.custom_text_proj(last_hidden_states) + # print(f"Query proj: {proj.shape}") + # normalize l2 norm + proj = proj / proj.norm(dim=-1, keepdim=True) + proj = proj * kwargs["attention_mask"].unsqueeze(-1) + return proj + + def vision_model_output(self, input_ids: torch.LongTensor = None, pixel_values: torch.FloatTensor = None, **kwargs): + + inputs_embeds = self.model.get_input_embeddings()(input_ids) + # 2. Merge text and images + if pixel_values is not None and input_ids.shape[1] != 1: + image_outputs = self.model.vision_tower(pixel_values.to(inputs_embeds.dtype)) + selected_image_feature = image_outputs.last_hidden_state + image_features = self.model.multi_modal_projector(selected_image_feature) + + return image_features + + raise ValueError("pixel_values is None or input_ids.shape[1] == 1") + + +class BiNewSiglip(PaliGemmaPreTrainedModel): + def __init__(self, config): + super(BiNewSiglip, self).__init__(config=config) + self.model: PaliGemmaForConditionalGeneration = PaliGemmaForConditionalGeneration(config) + self.main_input_name = "doc_input_ids" + + def forward(self, *args, **kwargs): + """ + Forward pass through Llama and the linear layer for dimensionality reduction + + Args: + - input_ids (torch.LongTensor): The input tokens tensor. + - attention_mask (torch.LongTensor): The attention mask tensor. 
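+
+        Note: this bi-encoder pools to a single L2-normalised vector per input,
+        taking the mean over projected image-patch features for documents and a
+        masked mean over token hidden states for queries.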
+ + Returns: + - torch.Tensor: Embeddings of shape (batch_size, num_tokens, dim) + """ + # outputs = self.model(*args, output_hidden_states=True, **kwargs) + if "pixel_values" in kwargs: + image_features = self.vision_model_output(*args, **kwargs) + # print(f"Doc: {image_features.shape}") + # pool image features + proj = torch.mean(image_features, dim=1) + # print(f"Doc proj: {proj.shape}") + norm = proj.norm(dim=-1, keepdim=True) + proj = proj / norm + else: + outputs = self.model(*args, output_hidden_states=True, **kwargs) + last_hidden_states = outputs.hidden_states[-1] # (batch_size, sequence_length, hidden_size) + # pooling -mean on attention mask==1 + + proj = torch.sum(last_hidden_states * kwargs["attention_mask"].unsqueeze(-1), dim=1) / torch.sum( + kwargs["attention_mask"], dim=1, keepdim=True + ) + # print(f"Query proj: {proj.shape}") + norm = proj.norm(dim=-1, keepdim=True) + proj = proj / norm + return proj + + def vision_model_output(self, input_ids: torch.LongTensor = None, pixel_values: torch.FloatTensor = None, **kwargs): + + inputs_embeds = self.model.get_input_embeddings()(input_ids) + # 2. Merge text and images + if pixel_values is not None and input_ids.shape[1] != 1: + image_outputs = self.model.vision_tower(pixel_values.to(inputs_embeds.dtype)) + selected_image_feature = image_outputs.last_hidden_state + image_features = self.model.multi_modal_projector(selected_image_feature) + + return image_features + + raise ValueError("pixel_values is None or input_ids.shape[1] == 1") diff --git a/colpali-main/colpali_engine/trainer/__init__.py b/colpali-main/colpali_engine/trainer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/colpali-main/colpali_engine/trainer/__pycache__/__init__.cpython-310.pyc b/colpali-main/colpali_engine/trainer/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..de280e48c5c65802553843c15bc0a3926ad87f1a Binary files /dev/null and b/colpali-main/colpali_engine/trainer/__pycache__/__init__.cpython-310.pyc differ diff --git a/colpali-main/colpali_engine/trainer/__pycache__/retrieval_evaluator.cpython-310.pyc b/colpali-main/colpali_engine/trainer/__pycache__/retrieval_evaluator.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e512781d51b374d7d86bf21c11e830cfe7a6ed5f Binary files /dev/null and b/colpali-main/colpali_engine/trainer/__pycache__/retrieval_evaluator.cpython-310.pyc differ diff --git a/colpali-main/colpali_engine/trainer/contrastive_trainer.py b/colpali-main/colpali_engine/trainer/contrastive_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..a1126ba2298d3f3b41937b7b85bf8f641375025c --- /dev/null +++ b/colpali-main/colpali_engine/trainer/contrastive_trainer.py @@ -0,0 +1,64 @@ +import torch +from transformers import Trainer + + +class ContrastiveTrainer(Trainer): + def __init__(self, loss_func, is_vision_model, *args, **kwargs): + super().__init__(*args, **kwargs) + self.loss_func = loss_func + self.is_vision_model = is_vision_model + + def compute_loss(self, model, inputs, return_outputs=False): + query_outputs = model(input_ids=inputs["query_input_ids"], attention_mask=inputs["query_attention_mask"]) + if self.is_vision_model: + if "doc_pixel_attention_mask" not in inputs: + doc_outputs = model( + input_ids=inputs["doc_input_ids"], + attention_mask=inputs["doc_attention_mask"], + pixel_values=inputs["doc_pixel_values"], + ) + else: + 
doc_outputs = model( + input_ids=inputs["doc_input_ids"], + attention_mask=inputs["doc_attention_mask"], + pixel_values=inputs["doc_pixel_values"], + pixel_attention_mask=inputs["doc_pixel_attention_mask"], + ) + else: + doc_outputs = model(input_ids=inputs["doc_input_ids"], attention_mask=inputs["doc_attention_mask"]) + + loss = self.loss_func(query_outputs, doc_outputs) + return (loss, (query_outputs, doc_outputs)) if return_outputs else loss + + def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=True): + """This function is used to generate predictions and return the loss for the given inputs.""" + if not prediction_loss_only: + raise ValueError("prediction_step is only called with prediction_loss_only=True") + + with torch.no_grad(): + if self.is_vision_model: + if "doc_pixel_attention_mask" not in inputs: + doc_outputs = model( + input_ids=inputs["doc_input_ids"], + attention_mask=inputs["doc_attention_mask"], + pixel_values=inputs["doc_pixel_values"], + ) + else: + doc_outputs = model( + input_ids=inputs["doc_input_ids"], + attention_mask=inputs["doc_attention_mask"], + pixel_values=inputs["doc_pixel_values"], + pixel_attention_mask=inputs["doc_pixel_attention_mask"], + ) + query_outputs = model( + input_ids=inputs["query_input_ids"], attention_mask=inputs["query_attention_mask"] + ) + else: + + query_outputs = model( + input_ids=inputs["query_input_ids"], attention_mask=inputs["query_attention_mask"] + ) + doc_outputs = model(input_ids=inputs["doc_input_ids"], attention_mask=inputs["doc_attention_mask"]) + + loss = self.loss_func(query_outputs, doc_outputs) + return loss, None, None diff --git a/colpali-main/colpali_engine/trainer/retrieval_evaluator.py b/colpali-main/colpali_engine/trainer/retrieval_evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..023cc4d0d9f934266fadeeb811253b98146d408f --- /dev/null +++ b/colpali-main/colpali_engine/trainer/retrieval_evaluator.py @@ -0,0 +1,72 @@ +import torch +from mteb.evaluation.evaluators import RetrievalEvaluator + + +class CustomEvaluator: + def __init__(self, is_multi_vector=False): + self.is_multi_vector = is_multi_vector + self.mteb_evaluator = RetrievalEvaluator() + + def evaluate(self, qs, ps): + if self.is_multi_vector: + scores = self.evaluate_colbert(qs, ps) + else: + scores = self.evaluate_biencoder(qs, ps) + + assert scores.shape[0] == len(qs) + + arg_score = scores.argmax(dim=1) + # compare to arange + accuracy = (arg_score == torch.arange(scores.shape[0], device=scores.device)).sum().item() / scores.shape[0] + print(arg_score) + print(f"Top 1 Accuracy (verif): {accuracy}") + + # cast to numpy + # scores = scores.cpu().numpy() + scores = scores.to(torch.float32).cpu().numpy() + return scores + + def compute_metrics(self, relevant_docs, results, **kwargs): + # wrap mteb package + + ndcg, _map, recall, precision, naucs = self.mteb_evaluator.evaluate( + relevant_docs, + results, + self.mteb_evaluator.k_values, + ignore_identical_ids=kwargs.get("ignore_identical_ids", True), + ) + mrr = self.mteb_evaluator.evaluate_custom(relevant_docs, results, self.mteb_evaluator.k_values, "mrr") + scores = { + **{f"ndcg_at_{k.split('@')[1]}": v for (k, v) in ndcg.items()}, + **{f"map_at_{k.split('@')[1]}": v for (k, v) in _map.items()}, + **{f"recall_at_{k.split('@')[1]}": v for (k, v) in recall.items()}, + **{f"precision_at_{k.split('@')[1]}": v for (k, v) in precision.items()}, + **{f"mrr_at_{k.split('@')[1]}": v for (k, v) in mrr[0].items()}, + **{f"naucs_at_{k.split('@')[1]}": v for (k, 
v) in naucs.items()}, + } + return scores + + def evaluate_colbert(self, qs, ps, batch_size=128) -> torch.Tensor: + scores = [] + for i in range(0, len(qs), batch_size): + scores_batch = [] + qs_batch = torch.nn.utils.rnn.pad_sequence(qs[i : i + batch_size], batch_first=True, padding_value=0).to( + "cpu" + ) + for j in range(0, len(ps), batch_size): + ps_batch = torch.nn.utils.rnn.pad_sequence( + ps[j : j + batch_size], batch_first=True, padding_value=0 + ).to("cpu") + scores_batch.append(torch.einsum("bnd,csd->bcns", qs_batch, ps_batch).max(dim=3)[0].sum(dim=2)) + scores_batch = torch.cat(scores_batch, dim=1).cpu() + scores.append(scores_batch) + scores = torch.cat(scores, dim=0) + return scores + + def evaluate_biencoder(self, qs, ps) -> torch.Tensor: + + qs = torch.stack(qs) + ps = torch.stack(ps) + + scores = torch.einsum("bd,cd->bc", qs, ps) + return scores diff --git a/colpali-main/colpali_engine/utils/__init__.py b/colpali-main/colpali_engine/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/colpali-main/colpali_engine/utils/__pycache__/__init__.cpython-310.pyc b/colpali-main/colpali_engine/utils/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e656a8123fa410e78425c32f73002247613b9bb Binary files /dev/null and b/colpali-main/colpali_engine/utils/__pycache__/__init__.cpython-310.pyc differ diff --git a/colpali-main/colpali_engine/utils/__pycache__/colpali_processing_utils.cpython-310.pyc b/colpali-main/colpali_engine/utils/__pycache__/colpali_processing_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9f127c2e9ebbd0543ac577b9b4065471f78b6cbb Binary files /dev/null and b/colpali-main/colpali_engine/utils/__pycache__/colpali_processing_utils.cpython-310.pyc differ diff --git a/colpali-main/colpali_engine/utils/__pycache__/image_from_page_utils.cpython-310.pyc b/colpali-main/colpali_engine/utils/__pycache__/image_from_page_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1bd4cd125a9f3eb8bc3d44ba713cc3273663d0b0 Binary files /dev/null and b/colpali-main/colpali_engine/utils/__pycache__/image_from_page_utils.cpython-310.pyc differ diff --git a/colpali-main/colpali_engine/utils/colidefics_processing_utils.py b/colpali-main/colpali_engine/utils/colidefics_processing_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2d19b4b145cb5409e71899265b8c2c1145d8a018 --- /dev/null +++ b/colpali-main/colpali_engine/utils/colidefics_processing_utils.py @@ -0,0 +1,53 @@ +# Utils for processing images and queries for ColPaLi + +def process_images(processor, images, max_length: int = 50): + texts_doc = [] + images = [image.convert("RGB") for image in images] + + for _ in images: + messages_doc = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Describe the image."}, + {"type": "image"}, + ], + }, + ] + + text_doc = processor.apply_chat_template(messages_doc, add_generation_prompt=False) + texts_doc.append(text_doc.strip()) + + batch_doc = processor( + text=texts_doc, + images=images, + return_tensors="pt", + padding="longest", + ) + return batch_doc + + +def process_queries(processor, queries, mock_image, max_length: int = 50): + texts_query = [] + for query in queries: + messages_query = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"Question: {query}", + }, + ], + }, + ] + text_query = 
processor.apply_chat_template(messages_query, add_generation_prompt=False).strip() + texts_query.append(text_query) + + batch_query = processor( + text=texts_query, + return_tensors="pt", + padding="longest", + max_length=max_length, + ) + return batch_query diff --git a/colpali-main/colpali_engine/utils/colpali_processing_utils.py b/colpali-main/colpali_engine/utils/colpali_processing_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4d2e577f5f7899712db5c3ebeef37c531192e89a --- /dev/null +++ b/colpali-main/colpali_engine/utils/colpali_processing_utils.py @@ -0,0 +1,36 @@ +# Utils for processing images and queries for ColPaLi + + +def process_images(processor, images, max_length: int = 50): + texts_doc = ["Describe the image."] * len(images) + images = [image.convert("RGB") for image in images] + + batch_doc = processor( + text=texts_doc, + images=images, + return_tensors="pt", + padding="longest", + max_length=max_length + processor.image_seq_length, + ) + return batch_doc + + +def process_queries(processor, queries, mock_image, max_length: int = 50): + texts_query = [] + for query in queries: + query = f"Question: {query}" + texts_query.append(query) + + batch_query = processor( + images=[mock_image.convert("RGB")] * len(texts_query), + # NOTE: the image is not used in batch_query but it is required for calling the processor + text=texts_query, + return_tensors="pt", + padding="longest", + max_length=max_length + processor.image_seq_length, + ) + del batch_query["pixel_values"] + + batch_query["input_ids"] = batch_query["input_ids"][..., processor.image_seq_length :] + batch_query["attention_mask"] = batch_query["attention_mask"][..., processor.image_seq_length :] + return batch_query diff --git a/colpali-main/colpali_engine/utils/dataset_transformation.py b/colpali-main/colpali_engine/utils/dataset_transformation.py new file mode 100644 index 0000000000000000000000000000000000000000..8a328198abdbf25312092371a5492b34bfe66fcd --- /dev/null +++ b/colpali-main/colpali_engine/utils/dataset_transformation.py @@ -0,0 +1,158 @@ +import os + +from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset + +USE_LOCAL_DATASET = os.environ.get("USE_LOCAL_DATASET", "1") == "1" + + +def add_metadata_column(dataset, column_name, value): + def add_source(example): + example[column_name] = value + return example + + return dataset.map(add_source) + + +def load_train_set() -> DatasetDict: + + ds_paths = [ + "infovqa_train", + "docvqa_train", + "arxivqa_train", + "tatdqa_train", + "syntheticDocQA_government_reports_train", + "syntheticDocQA_healthcare_industry_train", + "syntheticDocQA_artificial_intelligence_train", + "syntheticDocQA_energy_train", + ] + base_path = "./data_dir/" if USE_LOCAL_DATASET else "vidore/" + ds_tot = [] + for path in ds_paths: + cpath = base_path + path + ds = load_dataset(cpath, split="train") + if "arxivqa" in path: + # subsample 10k + ds = ds.shuffle(42).select(range(10000)) + ds_tot.append(ds) + + dataset = concatenate_datasets(ds_tot) + dataset = dataset.shuffle(seed=42) + # split into train and test + dataset_eval = dataset.select(range(500)) + dataset = dataset.select(range(500, len(dataset))) + ds_dict = DatasetDict({"train": dataset, "test": dataset_eval}) + return ds_dict + + +def load_train_set_with_tabfquad() -> DatasetDict: + + ds_paths = [ + "infovqa_train", + "docvqa_train", + "arxivqa_train", + "tatdqa_train", + "tabfquad_train_subsampled", + "syntheticDocQA_government_reports_train", + 
"syntheticDocQA_healthcare_industry_train", + "syntheticDocQA_artificial_intelligence_train", + "syntheticDocQA_energy_train", + ] + base_path = "./data_dir/" if USE_LOCAL_DATASET else "vidore/" + ds_tot = [] + for path in ds_paths: + cpath = base_path + path + ds = load_dataset(cpath, split="train") + if "arxivqa" in path: + # subsample 10k + ds = ds.shuffle(42).select(range(10000)) + ds_tot.append(ds) + + dataset = concatenate_datasets(ds_tot) + dataset = dataset.shuffle(seed=42) + # split into train and test + dataset_eval = dataset.select(range(500)) + dataset = dataset.select(range(500, len(dataset))) + ds_dict = DatasetDict({"train": dataset, "test": dataset_eval}) + return ds_dict + + +def load_train_set_with_docmatix() -> DatasetDict: + + ds_paths = [ + "infovqa_train", + "docvqa_train", + "arxivqa_train", + "tatdqa_train", + "tabfquad_train_subsampled", + "syntheticDocQA_government_reports_train", + "syntheticDocQA_healthcare_industry_train", + "syntheticDocQA_artificial_intelligence_train", + "syntheticDocQA_energy_train", + "Docmatix_filtered_train", + ] + base_path = "./data_dir/" if USE_LOCAL_DATASET else "vidore/" + ds_tot = [] + for path in ds_paths: + cpath = base_path + path + ds = load_dataset(cpath, split="train") + if "arxivqa" in path: + # subsample 10k + ds = ds.shuffle(42).select(range(10000)) + ds_tot.append(ds) + + dataset = concatenate_datasets(ds_tot) + dataset = dataset.shuffle(seed=42) + # split into train and test + dataset_eval = dataset.select(range(500)) + dataset = dataset.select(range(500, len(dataset))) + ds_dict = DatasetDict({"train": dataset, "test": dataset_eval}) + return ds_dict + + +def load_docvqa_dataset() -> DatasetDict: + if USE_LOCAL_DATASET: + dataset_doc = load_dataset("./data_dir/DocVQA", "DocVQA", split="validation") + dataset_doc_eval = load_dataset("./data_dir/DocVQA", "DocVQA", split="test") + dataset_info = load_dataset("./data_dir/DocVQA", "InfographicVQA", split="validation") + dataset_info_eval = load_dataset("./data_dir/DocVQA", "InfographicVQA", split="test") + else: + dataset_doc = load_dataset("lmms-lab/DocVQA", "DocVQA", split="validation") + dataset_doc_eval = load_dataset("lmms-lab/DocVQA", "DocVQA", split="test") + dataset_info = load_dataset("lmms-lab/DocVQA", "InfographicVQA", split="validation") + dataset_info_eval = load_dataset("lmms-lab/DocVQA", "InfographicVQA", split="test") + + # concatenate the two datasets + dataset = concatenate_datasets([dataset_doc, dataset_info]) + dataset_eval = concatenate_datasets([dataset_doc_eval, dataset_info_eval]) + # sample 100 from eval dataset + dataset_eval = dataset_eval.shuffle(seed=42).select(range(200)) + + # rename question as query + dataset = dataset.rename_column("question", "query") + dataset_eval = dataset_eval.rename_column("question", "query") + + # create new column image_filename that corresponds to ucsf_document_id if not None, else image_url + dataset = dataset.map( + lambda x: {"image_filename": x["ucsf_document_id"] if x["ucsf_document_id"] is not None else x["image_url"]} + ) + dataset_eval = dataset_eval.map( + lambda x: {"image_filename": x["ucsf_document_id"] if x["ucsf_document_id"] is not None else x["image_url"]} + ) + + ds_dict = DatasetDict({"train": dataset, "test": dataset_eval}) + + return ds_dict + + +class TestSetFactory: + def __init__(self, dataset_path): + self.dataset_path = dataset_path + + def __call__(self, *args, **kwargs): + dataset = load_dataset(self.dataset_path, split="test") + return dataset + + +if __name__ == "__main__": + ds = 
TestSetFactory("vidore/tabfquad_test_subsampled")() + print(ds) diff --git a/colpali-main/colpali_engine/utils/gpu_stats.py b/colpali-main/colpali_engine/utils/gpu_stats.py new file mode 100644 index 0000000000000000000000000000000000000000..4d3efa36bd9c5ff781a9afd798bada3d095a538e --- /dev/null +++ b/colpali-main/colpali_engine/utils/gpu_stats.py @@ -0,0 +1,24 @@ +# cond import +try: + from pynvml import * + + def print_gpu_utilization(): + nvmlInit() + handle = nvmlDeviceGetHandleByIndex(0) + info = nvmlDeviceGetMemoryInfo(handle) + print(f"GPU memory occupied: {info.used // 1024 ** 2} MB.") + + def print_summary(result): + print(f"Time: {result.metrics['train_runtime']:.2f}") + print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}") + print_gpu_utilization() + +except ImportError: + print("pynvml not found. GPU stats will not be printed.") + + def print_summary(result): + print(f"Time: {result.metrics['train_runtime']:.2f}") + print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}") + + def print_gpu_utilization(): + pass diff --git a/colpali-main/colpali_engine/utils/image_from_page_utils.py b/colpali-main/colpali_engine/utils/image_from_page_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6bcc158b5ef1fe675a885a812f0abd2748101486 --- /dev/null +++ b/colpali-main/colpali_engine/utils/image_from_page_utils.py @@ -0,0 +1,21 @@ +import requests +from PIL import Image + + +def load_from_pdf(pdf_path: str): + from pdf2image import convert_from_path + + images = convert_from_path(pdf_path) + return images + + +def load_from_image_urls(urls: str): + images = [Image.open(requests.get(url, stream=True).raw) for url in urls] + return images + + +def load_from_dataset(dataset): + from datasets import load_dataset + + dataset = load_dataset(dataset, split="test") + return dataset["image"] diff --git a/colpali-main/colpali_engine/utils/image_utils.py b/colpali-main/colpali_engine/utils/image_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..61a31c236b33a65f8474363b3d058c6b29718482 --- /dev/null +++ b/colpali-main/colpali_engine/utils/image_utils.py @@ -0,0 +1,64 @@ +""" +Utility functions for working with images. +""" + +import base64 +import io + +from PIL import Image + + +def scale_image(image: Image.Image, new_height: int = 1024) -> Image.Image: + """ + Scale an image to a new height while maintaining the aspect ratio. + """ + # Calculate the scaling factor + width, height = image.size + aspect_ratio = width / height + new_width = int(new_height * aspect_ratio) + + # Resize the image + scaled_image = image.resize((new_width, new_height)) + + return scaled_image + + +def scale_to_max_dimension(image: Image.Image, max_dimension: int = 1024) -> Image.Image: + """ + Scale an image to a maximum dimension while maintaining the aspect ratio. + """ + # Get the dimensions of the image + width, height = image.size + + max_original_dimension = max(width, height) + + if max_original_dimension < max_dimension: + return image + + # Calculate the scaling factor + aspect_ratio = max_dimension / max_original_dimension + new_width = int(width * aspect_ratio) + new_height = int(height * aspect_ratio) + + # Resize the image + scaled_image = image.resize((new_width, new_height)) + + return scaled_image + + +def get_base64_image(img: str | Image.Image, add_url_prefix: bool = True) -> str: + """ + Convert an image (from a filepath or a PIL.Image object) to a JPEG-base64 string. 
+ """ + if isinstance(img, str): + img = Image.open(img) + elif isinstance(img, Image.Image): + pass + else: + raise ValueError("`img` must be a path to an image or a PIL Image object.") + + buffered = io.BytesIO() + img.save(buffered, format="jpeg") + b64_data = base64.b64encode(buffered.getvalue()).decode("utf-8") + + return f"data:image/jpeg;base64,{b64_data}" if add_url_prefix else b64_data diff --git a/colpali-main/colpali_engine/utils/iter_utils.py b/colpali-main/colpali_engine/utils/iter_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..bef1792045e5c973d07049e66838acc8f8bad7f6 --- /dev/null +++ b/colpali-main/colpali_engine/utils/iter_utils.py @@ -0,0 +1,42 @@ +import sys + + +def islice(iterable, *args): + """ + Yield a slice of an iterable. + >>> islice('ABCDEFG', 2) → A B + >>> islice('ABCDEFG', 2, 4) → C D + >>> islice('ABCDEFG', 2, None) → C D E F G + >>> islice('ABCDEFG', 0, None, 2) → A C E G + """ + s = slice(*args) + start, stop, step = s.start or 0, s.stop or sys.maxsize, s.step or 1 + it = iter(range(start, stop, step)) + try: + nexti = next(it) + except StopIteration: + # Consume *iterable* up to the *start* position. + for i, element in zip(range(start), iterable): + pass + return + try: + for i, element in enumerate(iterable): + if i == nexti: + yield element + nexti = next(it) + except StopIteration: + # Consume to *stop*. + for i, element in zip(range(i + 1, stop), iterable): + pass + + +def batched(iterable, n: int): + """ + Yield batches of n elements from an iterable. + >>> batched('ABCDEFG', 3) → ABC DEF G + """ + if n < 1: + raise ValueError("n must be at least one") + it = iter(iterable) + while batch := tuple(islice(it, n)): + yield batch diff --git a/colpali-main/colpali_engine/utils/pdf_utils.py b/colpali-main/colpali_engine/utils/pdf_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ea07bec45b03afc48aec086a7a783437ececed75 --- /dev/null +++ b/colpali-main/colpali_engine/utils/pdf_utils.py @@ -0,0 +1,87 @@ +import glob +import os +import random +from pathlib import Path + +from pdf2image import convert_from_path +from tqdm import tqdm + +random.seed(42) + + +def convert_pdf_to_images(pdf_file: str, save_folder: str): + """ + Convert each page of a pdf to a jpg image and save them in a folder. + + Args: + - pdf_file (str): path to the pdf file + - save_folder (str): path to the folder where the images will be saved + + """ + images = convert_from_path(pdf_file) + + for i, image in enumerate(images): + if not os.path.exists(save_folder): + os.makedirs(save_folder) + image.save(os.path.join(save_folder, f"page_{i+1}.jpg"), "JPEG") + + +def convert_all_pdfs_to_images(path_to_folder: str, n_samples: int = 0): + """ + Convert all pdfs in a folder and its subfolder to images and save them in a folder. + It will sample n_samples pdf files in each subfolder, allowing to have granularity on the number of pdf files to convert. + + + Args: + - path_to_folder (str): path to the folder containing the pdf files + - n_samples (int): number of pdf files to sample in each subfolder + + directory structure: + - path_to_folder + - subfolder1 + - pdf1 + - pdf2 + - ... + - subfolder2 + - pdf1 + - pdf2 + - ... + - ... 
+ + """ + # take n_samples pdf files in each subfolder : I want to take 10 pdf files from each subfolder + sub_dirs = [d for d in os.listdir(path_to_folder) if os.path.isdir(os.path.join(path_to_folder, d))] + + sampled_files = [] + + for sub_dir in sub_dirs: + pdf_files = glob.glob(os.path.join(path_to_folder, sub_dir, "*.pdf")) + + if (n_samples == 0) or (len(pdf_files) <= n_samples): + print(f"Taking all pdf files in {sub_dir}") + sampled_files.extend(pdf_files) + + else: + print(f"Taking {n_samples} pdf files in {sub_dir}") + sampled_files.extend(random.sample(pdf_files, n_samples)) + + pdf_files = [str(file) for file in sampled_files] + + # Create an empty text file that will contain the file paths of the corrupted pdf files + dirpath_corrupted = Path(path_to_folder) / "corrupted_pdf_files.txt" + dirpath_corrupted.parent.mkdir(parents=True, exist_ok=True) + + with dirpath_corrupted.open("w") as f: + with tqdm(total=len(pdf_files)) as pbar: + for pdf_file in pdf_files: + pbar.set_description(f"Processing {pdf_file}") + save_folder = os.path.join("pages_extracted", *Path(pdf_file).parts[-2:]) + if not os.path.exists(os.path.join(path_to_folder, save_folder)): + try: + convert_pdf_to_images(pdf_file, os.path.join(path_to_folder, save_folder)) + except Exception as e: + print(f"Error converting {pdf_file}: {e}") + f.write(pdf_file) + f.write("\n") + pbar.update(1) + return diff --git a/colpali-main/colpali_engine/utils/plot_utils.py b/colpali-main/colpali_engine/utils/plot_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9e0c414ddca8cad08ae292f219eac43356008719 --- /dev/null +++ b/colpali-main/colpali_engine/utils/plot_utils.py @@ -0,0 +1,6 @@ +import seaborn as sns + + +def setup_seaborn(): + sns.set_style("white") + sns.set_context("paper", font_scale=2) diff --git a/colpali-main/colpali_engine/utils/torch_utils.py b/colpali-main/colpali_engine/utils/torch_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..45c6513327a5be87cdc4c3af877fa60a0e5d6dc8 --- /dev/null +++ b/colpali-main/colpali_engine/utils/torch_utils.py @@ -0,0 +1,18 @@ +""" +Utility functions for interpretability. +""" + +import torch + + +def get_torch_device() -> str: + """ + Returns the device and dtype to be used for torch tensors. 
+ """ + if torch.cuda.is_available(): + device = "cuda:0" + elif torch.backends.mps.is_available(): # for Apple Silicon + device = "mps" + else: + device = "cpu" + return device diff --git a/colpali-main/colpali_engine/utils/train_colpali_engine_models.py b/colpali-main/colpali_engine/utils/train_colpali_engine_models.py new file mode 100644 index 0000000000000000000000000000000000000000..65555273f80d56f495d1e0356edd6c21d0a63ff8 --- /dev/null +++ b/colpali-main/colpali_engine/utils/train_colpali_engine_models.py @@ -0,0 +1,247 @@ +# HuggingFace trainer +import json +import os +from dataclasses import dataclass +from typing import Callable, Dict, Optional + +import torch +from datasets import concatenate_datasets +from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training +from torch.utils.data import DataLoader +from tqdm import tqdm +from transformers import AutoTokenizer, Idefics2Processor, PreTrainedModel, PreTrainedTokenizer, TrainingArguments + +from colpali_engine.dataset.custom_collator import CustomCollator +from colpali_engine.loss.colbert_loss import BiEncoderLoss, BiPairwiseCELoss, ColbertLoss, ColbertPairwiseCELoss +from colpali_engine.trainer.contrastive_trainer import ContrastiveTrainer +from colpali_engine.trainer.retrieval_evaluator import CustomEvaluator +from colpali_engine.utils.gpu_stats import print_gpu_utilization, print_summary + + +@dataclass +class ColModelTrainingConfig: + model: PreTrainedModel + tr_args: TrainingArguments = None + output_dir: str = None + max_length: int = 256 + run_eval: bool = True + run_train: bool = True + peft_config: Optional[LoraConfig] = None + add_suffix: bool = False + processor: Idefics2Processor = None + tokenizer: PreTrainedTokenizer = None + loss_func: Optional[Callable] = ColbertLoss() + dataset_loading_func: Optional[Callable] = None + eval_dataset_loader: Optional[Dict[str, Callable]] = None + pretrained_peft_model_name_or_path: Optional[str] = None + + def __post_init__(self): + if self.output_dir is None: + sanitized_name = str(self.model.name_or_path).replace("/", "_") + self.output_dir = f"./models/{sanitized_name}" + + if self.tr_args is None: + self.tr_args = TrainingArguments(output_dir=self.output_dir) + elif self.tr_args.output_dir is None: + self.tr_args.output_dir = self.output_dir + + # cast if string + if isinstance(self.tr_args.learning_rate, str): + self.tr_args.learning_rate = float(self.tr_args.learning_rate) + self.tr_args.remove_unused_columns = False + + if self.processor is None and self.tokenizer is None: + print("Using textual model tokenization") + self.tokenizer = AutoTokenizer.from_pretrained(self.model.name_or_path) + + if self.pretrained_peft_model_name_or_path is not None: + self.model.load_adapter(self.pretrained_peft_model_name_or_path) + print(f"Loaded pretrained adapter from {self.pretrained_peft_model_name_or_path}") + + if self.peft_config is not None: + print("Configurating PEFT model") + if self.processor is None: + # Might be deprecated - use the "else" branch + self.model = prepare_model_for_kbit_training(self.model) # use_gradient_checkpointing=True + # self.model.enable_input_require_grads() + self.model = get_peft_model(self.model, self.peft_config) + self.model.print_trainable_parameters() + else: + # Ugly debugging hack + # if self.model.model.config.text_config.vocab_size == 32000: + # print("DEBUG: Resizing token embeddings - This should not happen in a real scenario!") + # self.model.model.text_model.resize_token_embeddings(32003) + # 
self.model.model.vision_model.encoder.layers = self.model.model.vision_model.encoder.layers[0:2] + # self.model.enable_input_require_grads() + if self.pretrained_peft_model_name_or_path is None: + self.model.add_adapter(self.peft_config) + self.model.enable_adapters() + else: + print(f"Adapter already loaded from {self.pretrained_peft_model_name_or_path}. Not overwriting.") + + print_gpu_utilization() + + +class ColModelTraining: + def __init__(self, config: ColModelTrainingConfig) -> None: + self.config = config + self.model = self.config.model + self.dataset = self.config.dataset_loading_func() + self.collator = CustomCollator( + processor=self.config.processor, tokenizer=self.config.tokenizer, max_length=self.config.max_length + ) + self.current_git_hash = os.popen("git rev-parse HEAD").read().strip() + self.retriever_evaluator = CustomEvaluator( + is_multi_vector=( + isinstance(self.config.loss_func, ColbertLoss) + or isinstance(self.config.loss_func, ColbertPairwiseCELoss) + ) + ) + + def train(self) -> None: + + trainer = ContrastiveTrainer( + model=self.model, + train_dataset=self.dataset["train"], + eval_dataset=self.dataset["test"], + args=self.config.tr_args, + data_collator=self.collator, + loss_func=self.config.loss_func, + is_vision_model=self.config.processor is not None, + ) + trainer.args.remove_unused_columns = False + + result = trainer.train() + print_summary(result) + + def eval_dataset(self, test_dataset): + + self.model.eval() + + # # debug + # if len(test_dataset) > 200: + # test_dataset = test_dataset.select(range(0, 100)) + + idx_with_query = [idx for idx, sample in enumerate(test_dataset["query"]) if sample is not None] + idx_without_query = [idx for idx, sample in enumerate(test_dataset["query"]) if sample is None] + + dataloader_with_query = DataLoader( + test_dataset.select(idx_with_query), + batch_size=self.config.tr_args.per_device_eval_batch_size, + shuffle=False, + collate_fn=self.collator, + ) + dataloader_without_query = DataLoader( + test_dataset.select(idx_without_query), + batch_size=self.config.tr_args.per_device_eval_batch_size, + shuffle=False, + collate_fn=self.collator, + ) + + # dataset is ordered so that non-null queries come first + test_dataset = concatenate_datasets( + [test_dataset.select(idx_with_query), test_dataset.select(idx_without_query)] + ) + + relevant_docs = {} + docidx_2_docid = {} + qsidx_2_query = [] + for idx, sample in enumerate(test_dataset): + doc_id = sample["image_filename"] if "image_filename" in sample else str(hash(sample["doc"])) + # query_id = sample["query_id"] if "query_id" in sample else str(hash(sample["query"])) + if sample["query"] is not None: + relevant_docs[str(idx)] = {doc_id: 1} + qsidx_2_query.append(str(idx)) + docidx_2_docid[str(idx)] = doc_id + + qs = [] + ps = [] + + device = self.model.device + with (torch.no_grad()): + for dataloader in [dataloader_with_query, dataloader_without_query]: + for batch in tqdm(dataloader): + if "doc_pixel_values" not in batch: + doc = self.model( + input_ids=batch["doc_input_ids"].to(device), + attention_mask=batch["doc_attention_mask"].to(device), + ) + + else: + if "doc_pixel_attention_mask" in batch: + doc = self.model( + input_ids=batch["doc_input_ids"].to(device), + attention_mask=batch["doc_attention_mask"].to(device), + pixel_values=batch["doc_pixel_values"].to(device), + pixel_attention_mask=batch["doc_pixel_attention_mask"].to(device), + ) + else: + doc = self.model( + input_ids=batch["doc_input_ids"].to(device), + 
attention_mask=batch["doc_attention_mask"].to(device), + pixel_values=batch["doc_pixel_values"].to(device), + ) + + ps.extend(list(torch.unbind(doc.to("cpu")))) + + if "query_input_ids" in batch: + query = self.model( + input_ids=batch["query_input_ids"].to(device), + attention_mask=batch["query_attention_mask"].to(device), + ) + # variable len + qs.extend(list(torch.unbind(query.to("cpu")))) + + print("Embeddings computed, evaluating") + scores = self.retriever_evaluator.evaluate(qs, ps) + # scores is 2d array of shape (n_queries, n_docs) + # turn it into a dict + results = {} + assert scores.shape[0] == len(qsidx_2_query) + for idx, scores_per_query in enumerate(scores): + results[qsidx_2_query[idx]] = { + docidx_2_docid[str(docidx)]: float(score) for docidx, score in enumerate(scores_per_query) + } + + # evaluate + metrics = self.retriever_evaluator.compute_metrics(relevant_docs, results) + print(metrics) + return metrics + + def eval(self) -> None: + + print("Evaluating on validation set") + metrics = self.eval_dataset(self.dataset["test"]) + print(f"Metrics for validation set: {metrics}") + all_metrics = {"validation_set": metrics} + + if self.config.eval_dataset_loader is not None: + for test_name, test_dataset_loading_func in self.config.eval_dataset_loader.items(): + print(f"Evaluating {test_name}") + test_ds = test_dataset_loading_func() + metrics = self.eval_dataset(test_ds) + all_metrics[test_name] = metrics + print(f"Metrics for {test_name}: {metrics}") + + # checkpoint dumps + with open(f"{self.config.output_dir}/results.json", "w") as f: + json.dump(all_metrics, f) + + # save results as json + with open(f"{self.config.output_dir}/results.json", "w") as f: + json.dump(all_metrics, f) + + def save(self, config_file): + # save model + self.model.save_pretrained(self.config.output_dir) + if self.config.tokenizer is not None: + self.config.tokenizer.save_pretrained(self.config.output_dir) + if self.config.processor is not None: + self.config.processor.save_pretrained(self.config.output_dir) # save config + + # copy-paste the yml file with os + os.system(f"cp {config_file} {self.config.output_dir}/training_config.yml") + + # save git hash of the commit at beginning of training + with open(f"{self.config.output_dir}/git_hash.txt", "w") as f: + f.write(self.current_git_hash) diff --git a/colpali-main/colpali_engine/utils/wrapper.py b/colpali-main/colpali_engine/utils/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..fe6202858c271c17eccb5b761550682270749aee --- /dev/null +++ b/colpali-main/colpali_engine/utils/wrapper.py @@ -0,0 +1,83 @@ +import importlib + +from colpali_engine.models.clip_baselines import ColSigLIP, SigLIP +from colpali_engine.models.colbert_architectures import ( + BiBERT, + BiXLMRoBERTa, + ColBERT, + ColCamembert, + ColLlama, + ColXLMRoBERTa, +) +from colpali_engine.models.idefics_colbert_architecture import BiIdefics, ColIdefics +from colpali_engine.models.paligemma_colbert_architecture import ( + BiNewSiglip, + BiPaliLast, + BiPaliMean, + ColNewSiglip, + ColPali, +) + +if importlib.util.find_spec("transformers") is not None: + from transformers import AutoProcessor, AutoTokenizer + from transformers.tokenization_utils import PreTrainedTokenizer + + class AutoProcessorWrapper: + def __new__(cls, *args, **kwargs): + return AutoProcessor.from_pretrained(*args, **kwargs) + + class AutoTokenizerWrapper(PreTrainedTokenizer): + def __new__(cls, *args, **kwargs): + return AutoTokenizer.from_pretrained(*args, **kwargs) + + class 
AutoColModelWrapper: + def __new__(cls, *args, **kwargs): + pretrained_model_name_or_path = None + if args: + pretrained_model_name_or_path = args[0] + elif kwargs: + pretrained_model_name_or_path = kwargs["pretrained_model_name_or_path"] + + training_objective = kwargs.pop("training_objective", "colbertv1") + + if "camembert" in pretrained_model_name_or_path: + return ColCamembert.from_pretrained(*args, **kwargs) + elif "xlm-roberta" in pretrained_model_name_or_path: + if training_objective == "biencoder": + return BiXLMRoBERTa.from_pretrained(*args, **kwargs) + return ColXLMRoBERTa.from_pretrained(*args, **kwargs) + elif ( + "llama" in pretrained_model_name_or_path.lower() or "croissant" in pretrained_model_name_or_path.lower() + ): + return ColLlama.from_pretrained(*args, **kwargs) + elif "idefics2" in pretrained_model_name_or_path: + if training_objective == "biencoder": + return BiIdefics.from_pretrained(*args, **kwargs) + return ColIdefics.from_pretrained(*args, **kwargs) + elif "siglip" in pretrained_model_name_or_path: + if training_objective == "biencoder_mean": + return SigLIP.from_pretrained(*args, **kwargs) + elif training_objective == "colbertv1": + return ColSigLIP.from_pretrained(*args, **kwargs) + else: + raise ValueError(f"Training objective {training_objective} not recognized") + elif "paligemma" in pretrained_model_name_or_path: + if training_objective == "biencoder_mean": + return BiPaliMean.from_pretrained(*args, **kwargs) + elif training_objective == "biencoder_last": + return BiPaliLast.from_pretrained(*args, **kwargs) + elif training_objective == "biencoder_mean_vision": + return BiNewSiglip.from_pretrained(*args, **kwargs) + elif training_objective == "colbertv1_vision": + return ColNewSiglip.from_pretrained(*args, **kwargs) + elif training_objective == "colbertv1": + return ColPali.from_pretrained(*args, **kwargs) + else: + raise ValueError(f"Training objective {training_objective} not recognized") + else: + if training_objective == "biencoder": + return BiBERT.from_pretrained(*args, **kwargs) + return ColBERT.from_pretrained(*args, **kwargs) + +else: + raise ModuleNotFoundError("Transformers must be loaded") diff --git a/colpali-main/demo/README.md b/colpali-main/demo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..037b3f142a19eff4ed1c6eb2493b9281170c9dd8 --- /dev/null +++ b/colpali-main/demo/README.md @@ -0,0 +1,6 @@ +--- +title: cvquest-colpali +app_file: app.py +sdk: gradio +sdk_version: 4.39.0 +--- diff --git a/colpali-main/demo/app.py b/colpali-main/demo/app.py new file mode 100644 index 0000000000000000000000000000000000000000..a64a491875b0e2afe815a0166efdd6b7ecf22594 --- /dev/null +++ b/colpali-main/demo/app.py @@ -0,0 +1,99 @@ +import os +import sys + +import gradio as gr +import torch +from pdf2image import convert_from_path +from PIL import Image +from torch.utils.data import DataLoader +from tqdm import tqdm +from transformers import AutoProcessor + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from colpali_engine.models.paligemma_colbert_architecture import ColPali +from colpali_engine.trainer.retrieval_evaluator import CustomEvaluator +from colpali_engine.utils.colpali_processing_utils import process_images, process_queries + +def search(query: str, ds, images, k): + qs = [] + with torch.no_grad(): + batch_query = process_queries(processor, [query], mock_image) + batch_query = {k: v.to(device) for k, v in batch_query.items()} + embeddings_query = model(**batch_query) + 
qs.extend(list(torch.unbind(embeddings_query.to("cpu")))) + + # run evaluation + retriever_evaluator = CustomEvaluator(is_multi_vector=True) + scores = retriever_evaluator.evaluate(qs, ds) + + top_k_indices = scores.argsort(axis=1)[0][-k:][::-1] + + results = [] + for idx in top_k_indices: + results.append((images[idx], f"Page {idx}")) + + return results + + #best_page = int(scores.argmax(axis=1).item()) + #return f"The most relevant page is {best_page}", images[best_page] + + +def index(file, ds): + """Example script to run inference with ColPali""" + images = [] + for f in file: + images.extend(convert_from_path(f)) + + # run inference - docs + dataloader = DataLoader( + images, + batch_size=4, + shuffle=False, + collate_fn=lambda x: process_images(processor, x), + ) + for batch_doc in tqdm(dataloader): + with torch.no_grad(): + batch_doc = {k: v.to(device) for k, v in batch_doc.items()} + embeddings_doc = model(**batch_doc) + ds.extend(list(torch.unbind(embeddings_doc.to("cpu")))) + return f"Uploaded and converted {len(images)} pages", ds, images + +COLORS = ["#4285f4", "#db4437", "#f4b400", "#0f9d58", "#e48ef1"] +# Load model +model_name = "vidore/colpali" +token = os.environ.get("HF_TOKEN") +model = ColPali.from_pretrained( + "google/paligemma-3b-mix-448", torch_dtype=torch.bfloat16, device_map="cpu", token=token +).eval() +model.load_adapter(model_name) +processor = AutoProcessor.from_pretrained(model_name, token=token) +device = model.device +mock_image = Image.new("RGB", (448, 448), (255, 255, 255)) + +with gr.Blocks() as demo: + gr.Markdown("# ColPali: Efficient Document Retrieval with Vision Language Models 📚🔍") + gr.Markdown("## 1️⃣ Upload PDFs") + file = gr.File(file_types=["pdf"], file_count="multiple") + + gr.Markdown("## 2️⃣ Convert the PDFs and upload") + convert_button = gr.Button("🔄 Convert and upload") + message = gr.Textbox("Files not yet uploaded") + embeds = gr.State(value=[]) + imgs = gr.State(value=[]) + + # Define the actions + convert_button.click(index, inputs=[file, embeds], outputs=[message, embeds, imgs]) + + gr.Markdown("## 3️⃣ Search") + query = gr.Textbox(placeholder="Enter your query here") + search_button = gr.Button("🔍 Search") + message2 = gr.Textbox("Query not yet set") + output_img = gr.Image() + k = gr.Slider(minimum=1, maximum=10, step=1, label="Number of results", value=5) + + search_button.click(search, inputs=[query, embeds, imgs, k], outputs=[message2, output_img]) + + +if __name__ == "__main__": + demo.queue(max_size=10).launch(debug=True) diff --git a/colpali-main/pyproject.toml b/colpali-main/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..66159e55011f2270865022d484424d7bee4766dd --- /dev/null +++ b/colpali-main/pyproject.toml @@ -0,0 +1,105 @@ +[project] +name = 'colpali_engine' +# dynamic = ["version"] +version = '0.0.1' +description = 'This repository centralizes ressources for the ColPali project.' 
+authors = [ + { name = 'Manuel Faysse', email = 'manuel.faysse@illuin.tech' }, + { name = 'Hugues Sibille', email = 'hugues.sibille@illuin.tech' }, + { name = 'Tony Wu', email = 'tony.wu@illuin.tech' }, +] +readme = 'README.md' +requires-python = '>=3.9' +classifiers = [ + 'Intended Audience :: Science/Research', + 'Intended Audience :: Developers', + 'Operating System :: OS Independent', + 'Private :: Do Not Upload', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.10', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + 'Typing :: Typed', +] +dependencies = [ + "torch>=2.2.0", + "transformers>=4.41.1", + "mteb>=1.12.22", + "requests", + "GPUtil", + "peft>=0.11.0, <0.12.0", +] + +[project.optional-dependencies] +dev = [ + "black>=24.4.2", + "coverage>=7.5.2", + "ipykernel>=6.29.4", + "mypy>=1.10.0", + "pytest>=8.2.1", + "ruff>=0.4.5", +] + +train = [ + "accelerate==0.30.1", + "configue==5.0.0", + "datasets==2.19.1", + "typer==0.12.3", + "bitsandbytes", +] + +[project.urls] +homepage = "https://github.com/ManuelFay/colpali" + +[build-system] +requires = ['setuptools', 'setuptools_scm[toml]', 'wheel'] +build-backend = 'setuptools.build_meta' + +[tool.setuptools_scm] +fallback_version = '0.0.0-dev' + +[tool.setuptools] +zip-safe = false +platforms = ['any'] + +[tool.setuptools.packages.find] +include = ['colpali_engine', 'colpali_engine.*'] + +[tool.mypy] +check_untyped_defs = true +disallow_untyped_defs = true +enable_error_code = ['ignore-without-code'] +exclude = ['docs/'] +mypy_path = '$MYPY_CONFIG_FILE_DIR/typings' +no_implicit_optional = true +show_error_codes = true +warn_redundant_casts = true +warn_return_any = true +warn_unused_configs = true +warn_unused_ignores = true +warn_unreachable = true + +[[tool.mypy.overrides]] +module = ['transformers', 'transformers.*', 'torch', 'torch.*'] +ignore_missing_imports = true + +[tool.coverage.run] +include = [] + +[tool.coverage.report] +exclude_lines = [ + 'pragma: no cover', + 'raise NotImplementedError', + 'if __name__ == "__main__":', + 'if TYPE_CHECKING:', + 'def __repr__', +] + +[tool.black] +line-length = 120 + +[tool.ruff] +select = ["E", "F", "W", "I", "N"] +line-length = 120 + +[tool.ruff.per-file-ignores] +'__init__.py' = ["F401"] diff --git a/colpali-main/requirements.txt b/colpali-main/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..49e9854a685c9bd586746ea5d012092f30d35c9c --- /dev/null +++ b/colpali-main/requirements.txt @@ -0,0 +1,7 @@ +-e . 
+ +black +pytest +pdf2image +Pillow +typer \ No newline at end of file diff --git a/colpali-main/scripts/configs/data/debug_data.yaml b/colpali-main/scripts/configs/data/debug_data.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6988d4b20c321ca7d0530ac3211ca4971dfe4f88 --- /dev/null +++ b/colpali-main/scripts/configs/data/debug_data.yaml @@ -0,0 +1,3 @@ +syntheticDocQA_energy: + (): colpali_engine.utils.dataset_transformation.TestSetFactory + dataset_path: vidore/syntheticDocQA_energy_test \ No newline at end of file diff --git a/colpali-main/scripts/configs/data/test_data.yaml b/colpali-main/scripts/configs/data/test_data.yaml new file mode 100644 index 0000000000000000000000000000000000000000..edc14c51687bbeeb7f51e695fc923ce7ccb52d3c --- /dev/null +++ b/colpali-main/scripts/configs/data/test_data.yaml @@ -0,0 +1,31 @@ +# eval_dataset_loader: +syntheticDocQA_energy: + (): colpali_engine.utils.dataset_transformation.TestSetFactory + dataset_path: !path ../../../data_dir/syntheticDocQA_energy_test +syntheticDocQA_healthcare_industry: + (): colpali_engine.utils.dataset_transformation.TestSetFactory + dataset_path: !path ../../../data_dir/syntheticDocQA_healthcare_industry_test +syntheticDocQA_artificial_intelligence_test: + (): colpali_engine.utils.dataset_transformation.TestSetFactory + dataset_path: !path ../../../data_dir/syntheticDocQA_artificial_intelligence_test +syntheticDocQA_government_reports: + (): colpali_engine.utils.dataset_transformation.TestSetFactory + dataset_path: !path ../../../data_dir/syntheticDocQA_government_reports_test +infovqa_subsampled: + (): colpali_engine.utils.dataset_transformation.TestSetFactory + dataset_path: !path ../../../data_dir/infovqa_test_subsampled +docvqa_subsampled: + (): colpali_engine.utils.dataset_transformation.TestSetFactory + dataset_path: !path ../../../data_dir/docvqa_test_subsampled +arxivqa_subsampled: + (): colpali_engine.utils.dataset_transformation.TestSetFactory + dataset_path: !path ../../../data_dir/arxivqa_test_subsampled +tabfquad_subsampled: + (): colpali_engine.utils.dataset_transformation.TestSetFactory + dataset_path: !path ../../../data_dir/tabfquad_test_subsampled +tatdqa: + (): colpali_engine.utils.dataset_transformation.TestSetFactory + dataset_path: !path ../../../data_dir/tatdqa_test +shift_project: + (): colpali_engine.utils.dataset_transformation.TestSetFactory + dataset_path: !path ../../../data_dir/shiftproject_test \ No newline at end of file diff --git a/colpali-main/scripts/configs/idefics/eval_colidefics_model.yaml b/colpali-main/scripts/configs/idefics/eval_colidefics_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1f84ca4576d791cd7add43d423ce206e055116ae --- /dev/null +++ b/colpali-main/scripts/configs/idefics/eval_colidefics_model.yaml @@ -0,0 +1,43 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + processor: + (): colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "HuggingFaceM4/idefics2-8b" + do_image_splitting: false + pretrained_peft_model_name_or_path: !path ../../../models/HuggingFaceM4_idefics2-8b-chatty + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "HuggingFaceM4/idefics2-8b-chatty" + training_objective: "colbertv1" + # attn_implementation: "flash_attention_2" + torch_dtype: !ext torch.bfloat16 + quantization_config: + (): transformers.BitsAndBytesConfig + load_in_4bit: true + bnb_4bit_quant_type: "nf4" + 
bnb_4bit_compute_dtype: "float16" + bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_tabfquad_retrieving + max_length: 256 + run_eval: true + run_train: false + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertLoss + tr_args: + (): transformers.training_args.TrainingArguments + output_dir: null + overwrite_output_dir: true + num_train_epochs: 3 + per_device_train_batch_size: 4 + gradient_accumulation_steps: 8 + per_device_eval_batch_size: 4 + eval_strategy: "steps" + dataloader_num_workers: 8 + # bf16: true + save_steps: 500 + logging_steps: 10 + eval_steps: 50 + warmup_steps: 100 + learning_rate: 5e-5 + save_total_limit: 1 diff --git a/colpali-main/scripts/configs/idefics/train_biidefics_model.yaml b/colpali-main/scripts/configs/idefics/train_biidefics_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ac3bc492d08db46c165988d3aff6205b7ebbce6 --- /dev/null +++ b/colpali-main/scripts/configs/idefics/train_biidefics_model.yaml @@ -0,0 +1,52 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/biidefics2-8b-chatty + processor: + (): colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "HuggingFaceM4/idefics2-8b" + do_image_splitting: false + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "HuggingFaceM4/idefics2-8b-chatty" + training_objective: "biencoder" + # attn_implementation: "flash_attention_2" + torch_dtype: !ext torch.bfloat16 + quantization_config: + (): transformers.BitsAndBytesConfig + load_in_4bit: true + bnb_4bit_quant_type: "nf4" + bnb_4bit_compute_dtype: "float16" + bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_docvqa_dataset + max_length: 256 + run_eval: true + loss_func: + (): colpali_engine.loss.colbert_loss.BiEncoderLoss + tr_args: + (): transformers.training_args.TrainingArguments + output_dir: null + overwrite_output_dir: true + num_train_epochs: 3 + per_device_train_batch_size: 4 + gradient_accumulation_steps: 8 + per_device_eval_batch_size: 4 + eval_strategy: "steps" + dataloader_num_workers: 8 + # bf16: true + save_steps: 500 + logging_steps: 10 + eval_steps: 50 + warmup_steps: 100 + learning_rate: 5e-5 + save_total_limit: 1 + + peft_config: + (): peft.LoraConfig + r: 8 + lora_alpha: 8 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$' diff --git a/colpali-main/scripts/configs/idefics/train_colidefics2_model.yaml b/colpali-main/scripts/configs/idefics/train_colidefics2_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0597093d17b751d433282e94ff1df603586a2a60 --- /dev/null +++ b/colpali-main/scripts/configs/idefics/train_colidefics2_model.yaml @@ -0,0 +1,41 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad/train_colidefics2-60 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/idefics2-8b" + do_image_splitting: false + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/idefics2-8b" + training_objective: "colbertv1" + # attn_implementation: 
"eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertPairwiseCELoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '.*(text_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$' + # target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + diff --git a/colpali-main/scripts/configs/idefics/train_colidefics_model.yaml b/colpali-main/scripts/configs/idefics/train_colidefics_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bfe68cb0e8a54a55ca26922458d3a1caf2ff44b4 --- /dev/null +++ b/colpali-main/scripts/configs/idefics/train_colidefics_model.yaml @@ -0,0 +1,54 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/colidefics2-8b-chatty-long + processor: + (): colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "HuggingFaceM4/idefics2-8b" + do_image_splitting: false + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "HuggingFaceM4/idefics2-8b-chatty" + training_objective: "colbertv1" + # attn_implementation: "flash_attention_2" + torch_dtype: !ext torch.bfloat16 + device_map: "auto" + quantization_config: + (): transformers.BitsAndBytesConfig + load_in_4bit: true + bnb_4bit_quant_type: "nf4" + bnb_4bit_compute_dtype: "bfloat16" + bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_docvqa_dataset + max_length: 380 + run_eval: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertLoss + tr_args: + (): transformers.training_args.TrainingArguments + output_dir: null + overwrite_output_dir: true + num_train_epochs: 3 + per_device_train_batch_size: 4 + gradient_accumulation_steps: 8 + per_device_eval_batch_size: 4 + eval_strategy: "steps" + dataloader_num_workers: 8 + # bf16: true + # gradient_checkpointing: true + save_steps: 500 + logging_steps: 10 + eval_steps: 50 + warmup_steps: 100 + learning_rate: 5e-5 + save_total_limit: 1 + + peft_config: + (): peft.LoraConfig + r: 8 + lora_alpha: 8 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$' diff --git a/colpali-main/scripts/configs/idefics/train_colidefics_model_debug.yaml b/colpali-main/scripts/configs/idefics/train_colidefics_model_debug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..adb532d0efc349eba18661206fd5340fca50f627 --- /dev/null +++ b/colpali-main/scripts/configs/idefics/train_colidefics_model_debug.yaml @@ -0,0 +1,54 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + processor: + () : 
colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "HuggingFaceM4/idefics2-8b" + do_image_splitting: false + max_length: 256 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "dondosss/tiny-random-idefics2" + training_objective: "biencoder" + # attn_implementation: "eager" + torch_dtype: !ext torch.float16 + quantization_config: + (): transformers.BitsAndBytesConfig + load_in_4bit: true + bnb_4bit_quant_type: "nf4" + bnb_4bit_compute_dtype: "float16" + bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_docvqa_dataset + max_length: 256 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.BiEncoderLoss + tr_args: + (): transformers.training_args.TrainingArguments + output_dir: null + overwrite_output_dir: true + num_train_epochs: 3 + per_device_train_batch_size: 1 + gradient_accumulation_steps: 2 + per_device_eval_batch_size: 1 + eval_strategy: "steps" + dataloader_num_workers: 8 + max_steps: 20 + bf16: false + save_steps: 500 + logging_steps: 10 + eval_steps: 10 + warmup_steps: 500 + learning_rate: 5e-5 + save_total_limit: 1 + + peft_config: + (): peft.LoraConfig + r: 8 + lora_alpha: 8 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$' diff --git a/colpali-main/scripts/configs/pali/eval_bipali_model.yaml b/colpali-main/scripts/configs/pali/eval_bipali_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6b92ba71a8bd5c55be0efd7ca7a659cc2e55414f --- /dev/null +++ b/colpali-main/scripts/configs/pali/eval_bipali_model.yaml @@ -0,0 +1,31 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/eval_v0_bipali_mean-3b-mix-448 + pretrained_peft_model_name_or_path: !path ../../../models/train_v0_bipali_mean-3b-mix-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "biencoder_mean" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.BiPairwiseCELoss + tr_args: !import ../tr_args/eval_tr_args.yaml diff --git a/colpali-main/scripts/configs/pali/eval_bisiglip_model.yaml b/colpali-main/scripts/configs/pali/eval_bisiglip_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3a5deb76c7454ee576a9c3327536e37ee5dd5685 --- /dev/null +++ b/colpali-main/scripts/configs/pali/eval_bisiglip_model.yaml @@ -0,0 +1,33 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/eval_v0_bisiglipnew-3b-mix-448 + processor: + () : 
colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + pretrained_peft_model_name_or_path: !path ../../../models/train_v0_bisiglipnew-3b-mix-448 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "biencoder_mean_vision" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + run_train: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.BiPairwiseCELoss + tr_args: !import ../tr_args/eval_tr_args.yaml + diff --git a/colpali-main/scripts/configs/pali/eval_colpali_model.yaml b/colpali-main/scripts/configs/pali/eval_colpali_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7097bf0f4f25aac0ffca07fbf5c5f7ed275e028b --- /dev/null +++ b/colpali-main/scripts/configs/pali/eval_colpali_model.yaml @@ -0,0 +1,33 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/results/evals/eval_colpali-3b-mix-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + pretrained_peft_model_name_or_path: !path ../../../models/results/without_tabfquad/train_colpali-3b-mix-448 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "colbertv1" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + run_train: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertPairwiseCELoss + tr_args: !import ../tr_args/eval_tr_args.yaml + diff --git a/colpali-main/scripts/configs/pali/eval_colsiglip_model.yaml b/colpali-main/scripts/configs/pali/eval_colsiglip_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9f91c05ecb73ba1781f7cbebb0816aa70bf33968 --- /dev/null +++ b/colpali-main/scripts/configs/pali/eval_colsiglip_model.yaml @@ -0,0 +1,33 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/eval_v0_colsiglipnew-3b-mix-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + pretrained_peft_model_name_or_path: !path ../../../models/train_v0_colsiglipnew-3b-mix-448 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "colbertv1" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# 
quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + run_train: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertPairwiseCELoss + tr_args: !import ../tr_args/eval_tr_args.yaml + diff --git a/colpali-main/scripts/configs/pali/train_bipali_all_model.yaml b/colpali-main/scripts/configs/pali/train_bipali_all_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d59853840cbaefc0d65efb47aed8481efd3d6487 --- /dev/null +++ b/colpali-main/scripts/configs/pali/train_bipali_all_model.yaml @@ -0,0 +1,41 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad_no_pairwise/train_bipali_all_mean-3b-mix-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "biencoder_mean" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.BiEncoderLoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model|vision_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(multi_modal_projector\.linear).*$)' + # target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$' + diff --git a/colpali-main/scripts/configs/pali/train_bipali_model.yaml b/colpali-main/scripts/configs/pali/train_bipali_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a3280d56dbe1a00eb4b898e970079f46737231a --- /dev/null +++ b/colpali-main/scripts/configs/pali/train_bipali_model.yaml @@ -0,0 +1,41 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad_no_pairwise/train_bipali_mean-3b-mix-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "biencoder_mean" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext 
colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.BiEncoderLoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$)' + # target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$' + diff --git a/colpali-main/scripts/configs/pali/train_bisiglip_new_model.yaml b/colpali-main/scripts/configs/pali/train_bisiglip_new_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..190b2866aec93d267dd6e5995287de491010012f --- /dev/null +++ b/colpali-main/scripts/configs/pali/train_bisiglip_new_model.yaml @@ -0,0 +1,38 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad_no_pairwise/train_bisiglip_new-3b-mix-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "biencoder_mean_vision" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.BiEncoderLoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$)' diff --git a/colpali-main/scripts/configs/pali/train_colpali_224_model.yaml b/colpali-main/scripts/configs/pali/train_colpali_224_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c1ac7db622d08838b37dfb8e8dce230aaf62ecf1 --- /dev/null +++ b/colpali-main/scripts/configs/pali/train_colpali_224_model.yaml @@ -0,0 +1,42 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad/train_colpali-3b-ft-ocrvqa-224 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-ft-ocrvqa-224" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-ft-ocrvqa-224" + training_objective: "colbertv1" + # attn_implementation: "eager" + # attn_implementation: "flash_attention_2" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + 
dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertPairwiseCELoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + # target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + diff --git a/colpali-main/scripts/configs/pali/train_colpali_448_model.yaml b/colpali-main/scripts/configs/pali/train_colpali_448_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ed901b78d1029b39a69be568c9056c1ab32d3640 --- /dev/null +++ b/colpali-main/scripts/configs/pali/train_colpali_448_model.yaml @@ -0,0 +1,42 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad/train_colpali-3b-ft-ocrvqa-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-ft-ocrvqa-448" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-ft-ocrvqa-448" + training_objective: "colbertv1" + # attn_implementation: "eager" + # attn_implementation: "flash_attention_2" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertPairwiseCELoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + # target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + diff --git a/colpali-main/scripts/configs/pali/train_colpali_896_model.yaml b/colpali-main/scripts/configs/pali/train_colpali_896_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..014a4a59ba3c8e2e48ed832338789ff942357385 --- /dev/null +++ b/colpali-main/scripts/configs/pali/train_colpali_896_model.yaml @@ -0,0 +1,42 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad/train_colpali-3b-ft-ocrvqa-896 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-ft-ocrvqa-896" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-ft-ocrvqa-896" + training_objective: "colbertv1" + # attn_implementation: "eager" + # 
attn_implementation: "flash_attention_2" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertPairwiseCELoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + # target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + diff --git a/colpali-main/scripts/configs/pali/train_colpali_all_model.yaml b/colpali-main/scripts/configs/pali/train_colpali_all_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dc7a81dab3c2e8db597f72e0842148f1e85aa617 --- /dev/null +++ b/colpali-main/scripts/configs/pali/train_colpali_all_model.yaml @@ -0,0 +1,41 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad_no_pairwise/train_colpali_all-3b-mix-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "colbertv1" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertLoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model|vision_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(multi_modal_projector\.linear).*$|.*(custom_text_proj).*$)' + # target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + diff --git a/colpali-main/scripts/configs/pali/train_colpali_docmatix_model.yaml b/colpali-main/scripts/configs/pali/train_colpali_docmatix_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7e2a4e481dca0c630339cdcd3f935db48cbd7d46 --- /dev/null +++ b/colpali-main/scripts/configs/pali/train_colpali_docmatix_model.yaml @@ -0,0 +1,41 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/train_colpali-docmatix-3b-mix-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + 
pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "colbertv1" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set_with_docmatix + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertPairwiseCELoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + # target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + diff --git a/colpali-main/scripts/configs/pali/train_colpali_model.yaml b/colpali-main/scripts/configs/pali/train_colpali_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b7de6678594fcb52d5d52801402cae3d1edda153 --- /dev/null +++ b/colpali-main/scripts/configs/pali/train_colpali_model.yaml @@ -0,0 +1,41 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad/train_colpali-3b-mix-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "colbertv1" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertPairwiseCELoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + # target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + diff --git a/colpali-main/scripts/configs/pali/train_colpali_pt_model.yaml b/colpali-main/scripts/configs/pali/train_colpali_pt_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c3950e215ea1a7e464e27b73ceb687dc521a2cf9 --- /dev/null +++ b/colpali-main/scripts/configs/pali/train_colpali_pt_model.yaml @@ -0,0 +1,41 @@ +config: + (): 
colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad/train_colpali-3b-pt-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-pt-448" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-pt-448" + training_objective: "colbertv1" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertPairwiseCELoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + # target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_text_proj).*$)' + diff --git a/colpali-main/scripts/configs/pali/train_colsiglip_new_model.yaml b/colpali-main/scripts/configs/pali/train_colsiglip_new_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d1be827506a5dfb46b8dc5371a85a8f8b0484ca7 --- /dev/null +++ b/colpali-main/scripts/configs/pali/train_colsiglip_new_model.yaml @@ -0,0 +1,38 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad_no_pairwise/train_colsiglip_new-3b-mix-448 + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + max_length: 50 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "./models/paligemma-3b-mix-448" + training_objective: "colbertv1_vision" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + max_length: 50 + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertLoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(language_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_(text|image)_proj).*$)' diff --git a/colpali-main/scripts/configs/siglip/eval_bisiglip_model.yaml b/colpali-main/scripts/configs/siglip/eval_bisiglip_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e6f0913ca7f6f2434d959e8dc801fb7f3540ac73 --- /dev/null +++ 
b/colpali-main/scripts/configs/siglip/eval_bisiglip_model.yaml @@ -0,0 +1,40 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/eval_real_siglip_untrained + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: !path ../../../models/siglip-so400m-patch14-384 + max_length: 64 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: !path ../../../models/siglip-so400m-patch14-384 + training_objective: "biencoder_mean" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 64 + run_train: true + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.BiPairwiseCELoss + tr_args: !import ../tr_args/eval_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(text_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$)' diff --git a/colpali-main/scripts/configs/siglip/train_bisiglip_model.yaml b/colpali-main/scripts/configs/siglip/train_bisiglip_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7c0d0405d4f0c4ba186a173ad48ae4c896635525 --- /dev/null +++ b/colpali-main/scripts/configs/siglip/train_bisiglip_model.yaml @@ -0,0 +1,40 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad_no_pairwise/train_real_siglip_text_only + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: !path ../../../models/siglip-so400m-patch14-384 + max_length: 64 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: !path ../../../models/siglip-so400m-patch14-384 + training_objective: "biencoder_mean" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 64 + run_train: true + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.BiEncoderLoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(text_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$)' diff --git a/colpali-main/scripts/configs/siglip/train_colsiglip_model.yaml b/colpali-main/scripts/configs/siglip/train_colsiglip_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..72e63f59d4ae02a2fbe3f88192ea9bcb52f2fc8d --- /dev/null +++ b/colpali-main/scripts/configs/siglip/train_colsiglip_model.yaml @@ -0,0 
+1,42 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad_no_pairwise/train_real_colsiglip_text_only + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: !path ../../../models/siglip-so400m-patch14-384 + max_length: 64 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: !path ../../../models/siglip-so400m-patch14-384 + training_objective: "colbertv1" + # attn_implementation: "eager" + torch_dtype: !ext torch.bfloat16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_train_set + eval_dataset_loader: !import ../data/test_data.yaml + + max_length: 64 + run_train: true + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertLoss + tr_args: !import ../tr_args/default_tr_args.yaml + peft_config: + (): peft.LoraConfig + r: 32 + lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + # target_modules: '(.*(text_model|vision_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_(text|image)_proj).*$)' + target_modules: '(.*(text_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_(text|image)_proj).*$)' + diff --git a/colpali-main/scripts/configs/siglip/train_siglip_model_debug.yaml b/colpali-main/scripts/configs/siglip/train_siglip_model_debug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..86e440fc700de4ed86f77f19873e537b5d5464f0 --- /dev/null +++ b/colpali-main/scripts/configs/siglip/train_siglip_model_debug.yaml @@ -0,0 +1,60 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/without_tabfquad_no_pairwise/train_real_siglip + processor: + () : colpali_engine.utils.wrapper.AutoProcessorWrapper + pretrained_model_name_or_path: google/siglip-so400m-patch14-384 + max_length: 64 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: google/siglip-so400m-patch14-384 + training_objective: "biencoder_mean" + # attn_implementation: "eager" + torch_dtype: !ext torch.float16 +# device_map: "auto" +# quantization_config: +# (): transformers.BitsAndBytesConfig +# load_in_4bit: true +# bnb_4bit_quant_type: "nf4" +# bnb_4bit_compute_dtype: "bfloat16" +# bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_docvqa_dataset + eval_dataset_loader: !import ../data/debug_data.yaml + + max_length: 64 + run_train: true + run_eval: true + add_suffix: true + loss_func: + (): colpali_engine.loss.colbert_loss.BiEncoderLoss + tr_args: + (): transformers.training_args.TrainingArguments + output_dir: null + overwrite_output_dir: true + num_train_epochs: 1 + per_device_train_batch_size: 2 + max_steps: 10 + # 6 x 8 gpus = 48 batch size + # gradient_accumulation_steps: 4 + per_device_eval_batch_size: 2 + eval_strategy: "steps" + # dataloader_num_workers: 8 + # bf16: true + save_steps: 500 + logging_steps: 10 + eval_steps: 50 + warmup_steps: 100 + learning_rate: 5e-5 + save_total_limit: 1 + optim: "paged_adamw_8bit" + + peft_config: + (): peft.LoraConfig + r: 32 + 
lora_alpha: 32 + lora_dropout: 0.1 + init_lora_weights: "gaussian" + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: '(.*(text_model).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*$|.*(custom_(text|image)_proj).*$)' diff --git a/colpali-main/scripts/configs/text_only/train_bibert_model.yaml b/colpali-main/scripts/configs/text_only/train_bibert_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c44f68b8508d9a6bf7a91f184b1881f469575dda --- /dev/null +++ b/colpali-main/scripts/configs/text_only/train_bibert_model.yaml @@ -0,0 +1,28 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/bixlm-roberta-base + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "FacebookAI/xlm-roberta-base" + training_objective: "biencoder" + add_suffix: true + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_manu_embeddings + max_length: 256 + run_eval: true + loss_func: + (): colpali_engine.loss.colbert_loss.BiEncoderLoss + tr_args: + (): transformers.training_args.TrainingArguments + output_dir: null + overwrite_output_dir: true + num_train_epochs: 3 + per_device_train_batch_size: 64 + gradient_accumulation_steps: 2 + per_device_eval_batch_size: 8 + dataloader_num_workers: 8 + # bf16: true + save_steps: 500 + logging_steps: 50 + warmup_steps: 500 + learning_rate: 5e-5 + save_total_limit: 1 diff --git a/colpali-main/scripts/configs/text_only/train_colbert_model.yaml b/colpali-main/scripts/configs/text_only/train_colbert_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d3237715ad8d89eacd46a2ee1508a4ebfef4e11a --- /dev/null +++ b/colpali-main/scripts/configs/text_only/train_colbert_model.yaml @@ -0,0 +1,27 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "FacebookAI/xlm-roberta-base" + + add_suffix: true + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_manu_embeddings + max_length: 256 + run_eval: true + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertLoss + tr_args: + (): transformers.training_args.TrainingArguments + output_dir: null + overwrite_output_dir: true + num_train_epochs: 3 + per_device_train_batch_size: 64 + gradient_accumulation_steps: 2 + per_device_eval_batch_size: 8 + dataloader_num_workers: 8 + # bf16: true + save_steps: 500 + logging_steps: 50 + warmup_steps: 500 + learning_rate: 5e-5 + save_total_limit: 1 diff --git a/colpali-main/scripts/configs/text_only/train_colbert_model_debug.yaml b/colpali-main/scripts/configs/text_only/train_colbert_model_debug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ec4c6d8e8ee758a311f582cff1425ec82464289a --- /dev/null +++ b/colpali-main/scripts/configs/text_only/train_colbert_model_debug.yaml @@ -0,0 +1,42 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "bert-base-uncased" + training_objective: "biencoder" # "biencoder" + torch_dtype: !ext torch.float16 + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_manu_embeddings + + loss_func: + (): colpali_engine.loss.colbert_loss.BiEncoderLoss # BiEncoderLoss # ColbertLoss + max_length: 128 + run_eval: true + run_train: true 
+ add_suffix: true + tr_args: + (): transformers.training_args.TrainingArguments + output_dir: null + overwrite_output_dir: true + num_train_epochs: 1 + max_steps: 10 + per_device_train_batch_size: 8 + gradient_accumulation_steps: 2 + per_device_eval_batch_size: 8 + dataloader_num_workers: 8 + bf16: false + save_steps: 50 + eval_steps: 50 + eval_strategy: "steps" + logging_steps: 10 + warmup_steps: 10 + learning_rate: 5e-4 + save_total_limit: 1 + + peft_config: + (): peft.LoraConfig + r: 16 + lora_alpha: 32 + lora_dropout: 0.05 + bias: "none" + task_type: "FEATURE_EXTRACTION" \ No newline at end of file diff --git a/colpali-main/scripts/configs/text_only/train_colllama_model.yaml b/colpali-main/scripts/configs/text_only/train_colllama_model.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b5081cf94dc60bc0599698a46e995a76f7ff50ba --- /dev/null +++ b/colpali-main/scripts/configs/text_only/train_colllama_model.yaml @@ -0,0 +1,53 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + output_dir: !path ../../../models/lora_CroissantCool-v0.2 + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "croissantllm/CroissantCool-v0.2" + attn_implementation: "flash_attention_2" + quantization_config: + (): transformers.BitsAndBytesConfig + load_in_4bit: true + bnb_4bit_quant_type: "nf4" + bnb_4bit_compute_dtype: "bfloat16" + bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_manu_embeddings + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertLoss + max_length: 256 + run_eval: true + add_suffix: true + tr_args: + (): transformers.training_args.TrainingArguments + output_dir: null + overwrite_output_dir: true + num_train_epochs: 3 + per_device_train_batch_size: 128 + gradient_accumulation_steps: 1 + per_device_eval_batch_size: 32 + dataloader_num_workers: 8 + bf16: true + save_steps: 500 + eval_steps: 50 + eval_strategy: "steps" + logging_steps: 10 + warmup_steps: 100 + learning_rate: 5e-5 + save_total_limit: 1 + + peft_config: + (): peft.LoraConfig + r: 16 + lora_alpha: 32 + lora_dropout: 0.05 + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: + - 'up_proj' + - 'down_proj' + - 'gate_proj' + - 'k_proj' + - 'q_proj' + - 'v_proj' + - 'o_proj' diff --git a/colpali-main/scripts/configs/text_only/train_colllama_model_debug.yaml b/colpali-main/scripts/configs/text_only/train_colllama_model_debug.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5e750399b54b23305d8bf84fef79d6421e3d3013 --- /dev/null +++ b/colpali-main/scripts/configs/text_only/train_colllama_model_debug.yaml @@ -0,0 +1,51 @@ +config: + (): colpali_engine.utils.train_colpali_engine_models.ColModelTrainingConfig + model: + (): colpali_engine.utils.wrapper.AutoColModelWrapper + pretrained_model_name_or_path: "HuggingFaceM4/tiny-random-LlamaForCausalLM" + # attn_implementation: "eager" + quantization_config: + (): transformers.BitsAndBytesConfig + load_in_4bit: true + bnb_4bit_quant_type: "nf4" + bnb_4bit_compute_dtype: "float16" + bnb_4bit_use_double_quant: true + + dataset_loading_func: !ext colpali_engine.utils.dataset_transformation.load_manu_embeddings + loss_func: + (): colpali_engine.loss.colbert_loss.ColbertLoss + max_length: 256 + run_eval: true + add_suffix: true + tr_args: + (): transformers.training_args.TrainingArguments + output_dir: null + overwrite_output_dir: true + num_train_epochs: 1 + per_device_train_batch_size: 64 + 
gradient_accumulation_steps: 2 + per_device_eval_batch_size: 8 + dataloader_num_workers: 8 + bf16: false + save_steps: 500 + logging_steps: 10 + warmup_steps: 500 + learning_rate: 5e-5 + save_total_limit: 1 + optim: "paged_adamw_8bit" + + peft_config: + (): peft.LoraConfig + r: 16 + lora_alpha: 32 + lora_dropout: 0.05 + bias: "none" + task_type: "FEATURE_EXTRACTION" + target_modules: + - 'up_proj' + - 'down_proj' + - 'gate_proj' + - 'k_proj' + - 'q_proj' + - 'v_proj' + - 'o_proj' diff --git a/colpali-main/scripts/configs/tr_args/default_tr_args.yaml b/colpali-main/scripts/configs/tr_args/default_tr_args.yaml new file mode 100644 index 0000000000000000000000000000000000000000..03186555b5c266dc57a9ffd68e752489ec281639 --- /dev/null +++ b/colpali-main/scripts/configs/tr_args/default_tr_args.yaml @@ -0,0 +1,18 @@ +(): transformers.training_args.TrainingArguments +output_dir: null +overwrite_output_dir: true +num_train_epochs: 1 +per_device_train_batch_size: 4 +# 6 x 8 gpus = 48 batch size +# gradient_accumulation_steps: 4 +per_device_eval_batch_size: 4 +eval_strategy: "steps" +# dataloader_num_workers: 8 +# bf16: true +save_steps: 500 +logging_steps: 10 +eval_steps: 50 +warmup_steps: 100 +learning_rate: 5e-5 +save_total_limit: 1 +# optim: "paged_adamw_8bit" diff --git a/colpali-main/scripts/configs/tr_args/eval_tr_args.yaml b/colpali-main/scripts/configs/tr_args/eval_tr_args.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e6fb7ea7f7ffaab41c5e2ef03f0a3ea12cb753e0 --- /dev/null +++ b/colpali-main/scripts/configs/tr_args/eval_tr_args.yaml @@ -0,0 +1,19 @@ + (): transformers.training_args.TrainingArguments + output_dir: null + overwrite_output_dir: true + num_train_epochs: 1 + per_device_train_batch_size: 4 + # 6 x 8 gpus = 48 batch size + # gradient_accumulation_steps: 4 + per_device_eval_batch_size: 4 + max_steps: 10 + eval_strategy: "steps" + # dataloader_num_workers: 8 + # bf16: true + save_steps: 500 + logging_steps: 10 + eval_steps: 50 + warmup_steps: 100 + learning_rate: 5e-5 + save_total_limit: 1 + optim: "paged_adamw_8bit" \ No newline at end of file diff --git a/colpali-main/scripts/infer/run_inference_with_python.py b/colpali-main/scripts/infer/run_inference_with_python.py new file mode 100644 index 0000000000000000000000000000000000000000..56384efedf61cf63722c0eee977fd3fa89670340 --- /dev/null +++ b/colpali-main/scripts/infer/run_inference_with_python.py @@ -0,0 +1,68 @@ +import sys +import os + +import torch +import typer +from torch.utils.data import DataLoader +from tqdm import tqdm +from transformers import AutoProcessor +from PIL import Image + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) + +from colpali_engine.models.paligemma_colbert_architecture import ColPali +from colpali_engine.trainer.retrieval_evaluator import CustomEvaluator +from colpali_engine.utils.colpali_processing_utils import process_images, process_queries +from colpali_engine.utils.image_from_page_utils import load_from_dataset + + +def main() -> None: + """Example script to run inference with ColPali""" + + # Load model + model_name = "vidore/colpali" + model = ColPali.from_pretrained("google/paligemma-3b-mix-448", torch_dtype=torch.bfloat16, device_map="cpu").eval() + model.load_adapter(model_name) + processor = AutoProcessor.from_pretrained(model_name) + + # select images -> load_from_pdf(), load_from_image_urls([""]), load_from_dataset() + images = load_from_dataset("vidore/docvqa_test_subsampled") + queries = ["From which university does James 
V. Fiorca come ?", "Who is the japanese prime minister?"] + + # run inference - docs + dataloader = DataLoader( + images, + batch_size=4, + shuffle=False, + collate_fn=lambda x: process_images(processor, x), + ) + ds = [] + for batch_doc in tqdm(dataloader): + with torch.no_grad(): + batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()} + embeddings_doc = model(**batch_doc) + ds.extend(list(torch.unbind(embeddings_doc.to("cpu")))) + + # run inference - queries + dataloader = DataLoader( + queries, + batch_size=4, + shuffle=False, + collate_fn=lambda x: process_queries(processor, x, Image.new("RGB", (448, 448), (255, 255, 255))), + ) + + qs = [] + for batch_query in dataloader: + with torch.no_grad(): + batch_query = {k: v.to(model.device) for k, v in batch_query.items()} + embeddings_query = model(**batch_query) + qs.extend(list(torch.unbind(embeddings_query.to("cpu")))) + + # run evaluation + retriever_evaluator = CustomEvaluator(is_multi_vector=True) + scores = retriever_evaluator.evaluate(qs, ds) + print(scores.argmax(axis=1)) + + +if __name__ == "__main__": + typer.run(main) diff --git a/colpali-main/scripts/train/train_colbert.py b/colpali-main/scripts/train/train_colbert.py new file mode 100644 index 0000000000000000000000000000000000000000..f06c299101bd4bf7a160aafe0a24157deddda8c3 --- /dev/null +++ b/colpali-main/scripts/train/train_colbert.py @@ -0,0 +1,29 @@ +from pathlib import Path +from colpali_engine.utils.train_colpali_engine_models import ColModelTrainingConfig, ColModelTraining +from colpali_engine.utils.gpu_stats import print_gpu_utilization +import typer +import configue + + +def main(config_file: Path) -> None: + print_gpu_utilization() + print("Loading config") + config = configue.load(config_file, sub_path="config") + print("Creating Setup") + if isinstance(config, ColModelTrainingConfig): + app = ColModelTraining(config) + else: + raise ValueError("Config must be of type ColModelTrainingConfig") + + if config.run_train: + print("Training model") + app.train() + app.save(config_file=config_file) + if config.run_eval: + print("Running evaluation") + app.eval() + print("Done!") + + +if __name__ == "__main__": + typer.run(main)
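Note on usage: the training entry point above (scripts/train/train_colbert.py) loads one of the configue YAML files from scripts/configs/ and runs training and/or evaluation according to the config's run_train / run_eval flags. The snippet below is a minimal sketch, not part of the repository diff, that mirrors this entry point from Python. It assumes the package has been installed in editable mode (pip install -e ., as in requirements.txt) and that the base model weights and data_dir datasets referenced by the chosen config are available locally; the config path is one of the files shown in this diff and is used purely as an illustration.

# Minimal sketch mirroring scripts/train/train_colbert.py.
# Assumptions: editable install of colpali_engine, and the model weights / datasets
# referenced by the chosen config are present locally.
from pathlib import Path

import configue

from colpali_engine.utils.train_colpali_engine_models import ColModelTraining, ColModelTrainingConfig

config_file = Path("scripts/configs/pali/train_colpali_model.yaml")  # any config shown in this diff
config = configue.load(config_file, sub_path="config")  # instantiates the (): constructors declared in the YAML
if not isinstance(config, ColModelTrainingConfig):
    raise ValueError("Config must be of type ColModelTrainingConfig")

app = ColModelTraining(config)
if config.run_train:
    app.train()
    app.save(config_file=config_file)
if config.run_eval:
    app.eval()

Equivalently, the same flow can be launched from the command line, since the script wraps main() with typer: python scripts/train/train_colbert.py scripts/configs/pali/train_colpali_model.yaml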