Introduce a custom Sentence Transformer module for smooth multi-modality

#1
by tomaarsen (HF staff) - opened
Files changed (4):
  1. README.md +31 -70
  2. custom_st.py +87 -0
  3. modules.json +12 -6
  4. sentence_bert_config.json +4 -1
README.md CHANGED
@@ -8983,66 +8983,29 @@ The core training code will be integrated into the rag-retrieval library(https:/
 
 This work was accomplished during my free time; please grant time a little time.
 
-## Usage
-```python
-import functools
-import PIL
-import numpy as np
-import torch
-from typing import Dict
-from io import BytesIO
-from transformers import SiglipImageProcessor
-from sentence_transformers import SentenceTransformer
-
-
-def jasper_vl_forward(self, features: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.Tensor]:
-    trans_features = {"input_ids": features["input_ids"], "attention_mask": features["attention_mask"]}
-    if "pixel_values" in features:
-        trans_features["pixel_values"] = features["pixel_values"]
-    sentence_embedding = self.auto_model(**trans_features, **kwargs)["sentence_embedding"]
-    features.update({"sentence_embedding": sentence_embedding})
-    return features
-
-
-def jasper_vl_tokenize(self, texts: list[Dict] | list[str]) -> dict[str, torch.Tensor]:
-    img_start_token = "<|jasper_img_start|>"
-    img_token = "<|jasper_img_token|>"
-    img_end_token = "<|jasper_img_end|>"
-    num_img_tokens = 300
-
-    def process_text_item(item):
-        if isinstance(item, str):
-            return item, []
-        text, images = "", []
-        for sub_item in item:
-            if sub_item["type"] == "text":
-                text += sub_item["content"]
-            elif sub_item["type"] == "image_bytes":
-                text += img_start_token + img_token * num_img_tokens + img_end_token
-                images.append(PIL.Image.open(BytesIO(sub_item["content"])).convert("RGB"))
-            elif sub_item["type"] == "image_path":
-                text += img_start_token + img_token * num_img_tokens + img_end_token
-                images.append(PIL.Image.open(sub_item["content"]).convert("RGB"))
-            else:
-                raise ValueError(f"unknown data type {sub_item['type']}")
-        return text, images
-
-    all_texts, all_images = [], []
-    for item in texts:
-        text, images = process_text_item(item)
-        all_texts.append(text)
-        all_images.extend(images)
-    ipt = self.tokenizer(all_texts, padding="longest", truncation=True, max_length=1024, return_tensors="pt")
-    if all_images:
-        ipt["pixel_values"] = self.processor(
-            images=all_images,
-            return_tensors="pt"
-        )["pixel_values"]
-        # For the sake of demonstration, external variables are used here, please modify the code according to your own environment.
-        if use_gpu:
-            ipt["pixel_values"] = ipt["pixel_values"].bfloat16()
-    return ipt
-
+Here's a short introduction to the training method:
+
+The core idea of jasper and stella is distillation: **let the student model learn the teacher model's vectors.**
+The training process of jasper has 4 stages:
+
+Stages 1 & 2: Distill from teacher vectors. In the jasper model the teacher models are nvidia/NV-Embed-v2 and dunzhang/stella_en_1.5B_v5 (Stage 1 and Stage 2 freeze different parameters).
+
+Stage 3: MRL training. I made some modifications to MRL to enable training on unsupervised text.
+
+Stage 4: Alignment between *jasper token embeddings from the image's detailed caption* and *vision embeddings from google/siglip-so400m-patch14-384*.
+I use an AdaptiveAvgPool2d to adjust the number and dimension of the vision tokens; this method does not need additional parameters.
+
+**The point of distillation is to achieve better results with smaller models, or to serve as a form of pre-training, not to hit the top of the leaderboards.**
+In fact, I have reached first place on MTEB (Chinese and English), but I will not release those two models; as I said before, it is meaningless.
+
+
+## Usage
+```python
+import torch
+from sentence_transformers import SentenceTransformer
+
 
 DOC1 = """
@@ -9062,10 +9025,6 @@ Color combinations: Decide how to best complement your preferred color with othe
 Color palette: Limit your color palette to a main color and one or two additional colors.
 60-30-10 rule: Use a primary color 60% of the time, a secondary color 30% of the time, and an accent color 10% of the time
 """
-prompt_dict = {
-    "s2p_query": "Instruct: Given a web search query, retrieve relevant passages that answer the query.\nQuery: ",
-    "s2s_query": "Instruct: Retrieve semantically similar text.\nQuery: "
-}
 if __name__ == "__main__":
     # load model
     use_gpu = False
@@ -9073,7 +9032,7 @@ if __name__ == "__main__":
     model = SentenceTransformer(
         model_name,
         trust_remote_code=True,
-        device="cpu",
+        device="cpu" if not use_gpu else "cuda",
         model_kwargs={
             "torch_dtype": torch.bfloat16 if use_gpu else torch.float32,
             "attn_implementation": "sdpa"
@@ -9082,13 +9041,10 @@ if __name__ == "__main__":
         ## 1024 is recommended
         # set is_text_encoder 'True', if you do not encode image
         config_kwargs={"is_text_encoder": False, "vector_dim": 1024},
-        tokenizer_kwargs={"padding_side": "right"}
     )
-    # jasper model cannot directly be used in SentenceTransformer, do some modifications
-    model.processor = SiglipImageProcessor.from_pretrained(model_name)
-    model.tokenize = functools.partial(jasper_vl_tokenize, model)
-    model._first_module().forward = functools.partial(jasper_vl_forward, model._first_module())
+    # We can reduce the max_seq_length from the default of 2048 for faster encoding
     model.max_seq_length = 1024
+
     # data
     q_list = [
         "Why the sky is blue?",
@@ -9099,16 +9055,21 @@ if __name__ == "__main__":
         [{"type": "image_path", "content": "./assets/img1.png"}, {"type": "text", "content": "Hope this image helps!"}],
         DOC2,
         [{"type": "image_path", "content": "./assets/img2.png"}],
-
     ]
-    q_vecs = model.encode([prompt_dict["s2p_query"] + text for text in q_list], normalize_embeddings=True)
-    doc_vecs = model.encode(doc_list, normalize_embeddings=True)
-    print(np.matmul(q_vecs, doc_vecs.T))
+    q_vecs = model.encode(q_list, prompt_name="s2p_query")
+    doc_vecs = model.encode(doc_list)
+
+    # calculate similarity
+    similarities = model.similarity(q_vecs, doc_vecs)
+    print(similarities)
     # the output is:
-    # [[0.777521 0.75944513 0.24291277 0.2187205]
-    # [0.32261407 0.30536035 0.74208796 0.5484469]]
+    # tensor([[0.7775, 0.7594, 0.2429, 0.2187],
+    #         [0.3226, 0.3054, 0.7421, 0.5484]])
+```
 
+## Evaluation on MTEB
+
+script: ./scripts/evaluate_en_mteb/run_evaluate_mteb.py
 
-```
 ## License
 **This model should not be used for any commercial purpose!**
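The Stage 4 description in the README addition above says the SigLIP vision embeddings are adapted with an AdaptiveAvgPool2d so that no extra parameters are needed. Below is a minimal sketch of that pooling trick, not the model's actual implementation; the shapes are assumptions (729 = 27 x 27 patch tokens and width 1152 for siglip-so400m-patch14-384, 300 matches num_img_tokens in custom_st.py, and 1536 is an illustrative text hidden size):

```python
import torch
import torch.nn as nn

# Assumed shapes, for illustration only:
#   729  = 27 x 27 patch tokens from siglip-so400m-patch14-384
#   1152 = SigLIP so400m hidden size
#   300  = num_img_tokens reserved by custom_st.py
#   1536 = assumed hidden size of the jasper text model
vision_tokens = torch.randn(2, 729, 1152)   # (batch, patches, vision_dim)

# AdaptiveAvgPool2d has no learnable weights; with a 3D input it pools the last
# two dimensions to the requested size and leaves the leading dimension alone.
pool = nn.AdaptiveAvgPool2d((300, 1536))    # target (num_img_tokens, text_hidden_dim)
img_token_embeds = pool(vision_tokens)      # (batch, 300, 1536)
print(img_token_embeds.shape)               # torch.Size([2, 300, 1536])
```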
custom_st.py ADDED
@@ -0,0 +1,87 @@
+from typing import Any, Dict, Optional
+from io import BytesIO
+
+import PIL
+import torch
+from transformers import SiglipImageProcessor
+from sentence_transformers.models import Transformer as BaseTransformer
+
+
+class MultiModalTransformer(BaseTransformer):
+
+    def __init__(
+        self,
+        model_name_or_path: str,
+        cache_dir: Optional[str] = None,
+        tokenizer_args: Optional[Dict[str, Any]] = None,
+        **kwargs,
+    ):
+        super().__init__(model_name_or_path, **kwargs)
+        if tokenizer_args is None:
+            tokenizer_args = {}
+        # Image processor that turns PIL images into pixel_values for the vision tower
+        self.processor = SiglipImageProcessor.from_pretrained(
+            model_name_or_path, cache_dir=cache_dir, **tokenizer_args
+        )
+
+    def forward(
+        self, features: dict[str, torch.Tensor], **kwargs
+    ) -> dict[str, torch.Tensor]:
+        trans_features = {
+            "input_ids": features["input_ids"],
+            "attention_mask": features["attention_mask"],
+        }
+        if "pixel_values" in features:
+            trans_features["pixel_values"] = features["pixel_values"].to(
+                self.auto_model.dtype
+            )
+
+        sentence_embedding = self.auto_model(**trans_features, **kwargs)[
+            "sentence_embedding"
+        ]
+        features.update({"sentence_embedding": sentence_embedding})
+        return features
+
+    def tokenize(self, texts: list[Dict] | list[str]) -> dict[str, torch.Tensor]:
+        # Each image is replaced by a span of num_img_tokens placeholder tokens in the text
+        img_start_token = "<|jasper_img_start|>"
+        img_token = "<|jasper_img_token|>"
+        img_end_token = "<|jasper_img_end|>"
+        num_img_tokens = 300
+
+        def process_text_item(item):
+            if isinstance(item, str):
+                return item, []
+            text, images = "", []
+            for sub_item in item:
+                if sub_item["type"] == "text":
+                    text += sub_item["content"]
+                elif sub_item["type"] == "image_bytes":
+                    text += img_start_token + img_token * num_img_tokens + img_end_token
+                    images.append(
+                        PIL.Image.open(BytesIO(sub_item["content"])).convert("RGB")
+                    )
+                elif sub_item["type"] == "image_path":
+                    text += img_start_token + img_token * num_img_tokens + img_end_token
+                    images.append(PIL.Image.open(sub_item["content"]).convert("RGB"))
+                else:
+                    raise ValueError(f"unknown data type {sub_item['type']}")
+            return text, images
+
+        all_texts, all_images = [], []
+        for item in texts:
+            text, images = process_text_item(item)
+            all_texts.append(text)
+            all_images.extend(images)
+        ipt = self.tokenizer(
+            all_texts,
+            padding="longest",
+            truncation=True,
+            max_length=self.max_seq_length,
+            return_tensors="pt",
+        )
+        if all_images:
+            ipt["pixel_values"] = self.processor(
+                images=all_images, return_tensors="pt"
+            )["pixel_values"]
+        return ipt
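For reference, a minimal sketch of how this module is exercised once modules.json points at it; the repo id is a placeholder, and trust_remote_code=True is what allows Sentence Transformers to import custom_st.py from the repository:

```python
from sentence_transformers import SentenceTransformer

model_name = "..."  # placeholder: the repo id of this model
model = SentenceTransformer(model_name, trust_remote_code=True)

# MultiModalTransformer.tokenize accepts plain strings or lists of
# {"type": ..., "content": ...} items with type "text", "image_path" or "image_bytes".
inputs = [
    "A plain text query.",
    [
        {"type": "image_path", "content": "./assets/img1.png"},
        {"type": "text", "content": "A caption that accompanies the image."},
    ],
]
embeddings = model.encode(inputs)
print(embeddings.shape)
```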
modules.json CHANGED
@@ -1,8 +1,14 @@
 [
-  {
-    "idx": 0,
-    "name": "0",
-    "path": "",
-    "type": "sentence_transformers.models.Transformer"
-  }
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "custom_st.MultiModalTransformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Normalize",
+    "type": "sentence_transformers.models.Normalize"
+  }
 ]
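Roughly, the updated modules.json now resolves to a two-module pipeline: the custom multi-modal transformer followed by a Normalize step, so encode() returns unit-length vectors and model.similarity() amounts to cosine similarity. A sketch of the equivalent manual composition, assuming the standard Sentence Transformers modules API and a placeholder local path:

```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Normalize

from custom_st import MultiModalTransformer  # the file added above

# idx 0, path ""           -> the custom transformer, loaded from the repo root
# idx 1, path "1_Normalize" -> parameter-free L2 normalization of the embeddings
transformer = MultiModalTransformer("path/to/local/checkout")  # placeholder path
model = SentenceTransformer(modules=[transformer, Normalize()])
```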
sentence_bert_config.json CHANGED
@@ -1,4 +1,7 @@
 {
   "max_seq_length": 2048,
-  "do_lower_case": false
+  "do_lower_case": false,
+  "tokenizer_args": {
+    "padding_side": "right"
+  }
 }
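With padding_side baked into tokenizer_args here, users no longer need to pass tokenizer_kwargs={"padding_side": "right"} at load time (the argument removed from the README above). A small sketch of the assumed effect, with a placeholder repo id:

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("...", trust_remote_code=True)  # placeholder repo id
# Assumption: tokenizer_args from sentence_bert_config.json is forwarded to the
# tokenizer, so the loaded tokenizer should already pad on the right.
print(model.tokenizer.padding_side)  # expected: "right"
```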