feat-inference-mode #1
opened by bwang0911

modeling_clip.py  CHANGED  (+221 -23)
@@ -5,8 +5,9 @@
 # and adjusted for Jina CLIP
 
 from functools import partial
-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple, Union, List
 
+import numpy as np
 import torch
 import torch.nn.functional as f
 import torch.utils.checkpoint
@@ -18,6 +19,12 @@ from transformers.models.clip.modeling_clip import (
     CLIPVisionModelOutput,
     clip_loss,
 )
+try:
+    from tqdm.autonotebook import trange
+
+    has_tqdm = True
+except ImportError:
+    has_tqdm = False
 
 from .configuration_clip import JinaCLIPConfig, JinaCLIPTextConfig, JinaCLIPVisionConfig
 from .eva_model import EVAVisionTransformer
@@ -215,6 +222,8 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
         self.visual_projection = nn.Identity()
         self.text_projection = nn.Identity()
 
+        self.tokenizer = None
+        self.preprocess = None
         self.post_init()
 
     def get_text_features(
@@ -239,33 +248,222 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
         )
         return self.visual_projection(self.vision_model(x=x))
 
+    def get_tokenizer(self):
+        if not self.tokenizer:
+            self.tokenizer = AutoTokenizer.from_pretrained(self.config._name_or_path, trust_remote_code=True)
+        return self.tokenizer
+
+    @torch.inference_mode()
     def encode_text(
         self,
-
-
-
-
-
-
-
-
-
-
-
+        sentences: Union[str, List[str]],
+        batch_size: int = 32,
+        show_progress_bar: Optional[bool] = None,
+        convert_to_numpy: bool = True,
+        convert_to_tensor: bool = False,
+        device: Optional[torch.device] = None,
+        normalize_embeddings: bool = False,
+        **tokenizer_kwargs,
+    ) -> Union[List[torch.Tensor], np.ndarray, torch.Tensor]:
+        """
+        Computes sentence embeddings.
+
+        Args:
+            sentences(`str` or `List[str]`):
+                Sentence or sentences to be encoded
+            batch_size(`int`, *optional*, defaults to 32):
+                Batch size for the computation
+            show_progress_bar(`bool`, *optional*, defaults to None):
+                Show a progress bar when encoding sentences.
+                If set to None, the progress bar is only shown when
+                `logger.level == logging.INFO` or `logger.level == logging.DEBUG`.
+            convert_to_numpy(`bool`, *optional*, defaults to True):
+                If true, the output is a list of numpy vectors.
+                Else, it is a list of pytorch tensors.
+            convert_to_tensor(`bool`, *optional*, defaults to False):
+                If true, you get one large tensor as return.
+                Overwrites any setting from convert_to_numpy
+            device(`torch.device`, *optional*, defaults to None):
+                Which torch.device to use for the computation
+            normalize_embeddings(`bool`, *optional*, defaults to False):
+                If set to true, returned vectors will have length 1. In that case,
+                the faster dot-product (util.dot_score) instead of cosine similarity
+                can be used.
+            tokenizer_kwargs(`Dict[str, Any]`, *optional*, defaults to {}):
+                Keyword arguments for the tokenizer
+        Returns:
+            By default, a list of tensors is returned.
+            If convert_to_tensor, a stacked tensor is returned.
+            If convert_to_numpy, a numpy matrix is returned.
+        """
+        is_training = self.training
+        self.eval()
+
+        self.tokenizer = self.get_tokenizer()
+
+        if show_progress_bar is None:
+            show_progress_bar = (
+                logger.getEffectiveLevel() == logging.INFO
+                or logger.getEffectiveLevel() == logging.DEBUG
+            )
+
+        if convert_to_tensor:
+            convert_to_numpy = False
+
+        input_was_string = False
+        if isinstance(sentences, str) or not hasattr(sentences, '__len__'):
+            sentences = [sentences]
+            input_was_string = True
+
+        if device is not None:
+            self.to(device)
+
+        permutation = np.argsort([-len(i) for i in sentences])
+        inverse_permutation = np.argsort(permutation)
+        sentences = [sentences[idx] for idx in permutation]
+
+        tokenizer_kwargs['padding'] = tokenizer_kwargs.get('padding', True)
+        tokenizer_kwargs['max_length'] = tokenizer_kwargs.get('max_length', 512)
+        tokenizer_kwargs['truncation'] = tokenizer_kwargs.get('truncation', True)
+
+        if has_tqdm:
+            range_iter = trange(
+                0,
+                len(sentences),
+                batch_size,
+                desc="Encoding",
+                disable=not show_progress_bar,
+            )
+        else:
+            range_iter = range(0, len(sentences), batch_size)
+
+        all_embeddings = []
+        for i in range_iter:
+            encoded_input = self.tokenizer(
+                sentences[i : i + batch_size],
+                return_tensors='pt',
+                **tokenizer_kwargs,
+            ).to(self.device)
+
+            embeddings = self.get_text_features(input_ids=encoded_input)
+            if normalize_embeddings:
+                embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
+            if convert_to_numpy:
+                embeddings = embeddings.cpu()
+            all_embeddings.extend(embeddings)
+
+        all_embeddings = [all_embeddings[idx] for idx in inverse_permutation]
+
+        if convert_to_tensor:
+            all_embeddings = torch.stack(all_embeddings)
+        elif convert_to_numpy:
+            all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings])
+
+        if input_was_string:
+            all_embeddings = all_embeddings[0]
+
+        self.train(is_training)
+        return all_embeddings
 
+    def get_preprocess(self):
+        if not self.preprocess:
+            self.preprocess = AutoImageProcessor.from_pretrained(self.config._name_or_path, trust_remote_code=True)
+        return self.preprocess
+
+    @torch.inference_mode()
     def encode_image(
         self,
-
-
-
-
-
-
-
-
-
-
-
+        images: Union[str, List[str]],
+        batch_size: int = 32,
+        show_progress_bar: Optional[bool] = None,
+        convert_to_numpy: bool = True,
+        convert_to_tensor: bool = False,
+        device: Optional[torch.device] = None,
+        normalize_embeddings: bool = False,
+    ) -> Union[List[torch.Tensor], np.ndarray, torch.Tensor]:
+        """
+        Computes image embeddings.
+
+        Args:
+            images(`str` or `List[str]`):
+                Image or image paths to be encoded
+            batch_size(`int`, *optional*, defaults to 32):
+                Batch size for the computation
+            show_progress_bar(`bool`, *optional*, defaults to None):
+                Show a progress bar when encoding images.
+                If set to None, the progress bar is only shown when
+                `logger.level == logging.INFO` or `logger.level == logging.DEBUG`.
+            convert_to_numpy(`bool`, *optional*, defaults to True):
+                If true, the output is a list of numpy vectors.
+                Else, it is a list of pytorch tensors.
+            convert_to_tensor(`bool`, *optional*, defaults to False):
+                If true, you get one large tensor as return.
+                Overwrites any setting from convert_to_numpy
+            device(`torch.device`, *optional*, defaults to None):
+                Which torch.device to use for the computation
+            normalize_embeddings(`bool`, *optional*, defaults to False):
+                If set to true, returned vectors will have length 1. In that case,
+                the faster dot-product (util.dot_score) instead of cosine similarity
+                can be used.
+        Returns:
+            By default, a list of tensors is returned.
+            If convert_to_tensor, a stacked tensor is returned.
+            If convert_to_numpy, a numpy matrix is returned.
+        """
+        from PIL import Image
+
+        is_training = self.training
+        self.eval()
+
+        self.preprocess = self.get_preprocess()
+
+        if show_progress_bar is None:
+            show_progress_bar = (
+                logger.getEffectiveLevel() == logging.INFO
+                or logger.getEffectiveLevel() == logging.DEBUG
+            )
+
+        if convert_to_tensor:
+            convert_to_numpy = False
+
+        input_was_single_img = False
+        if isinstance(images, str) or not hasattr(images, '__len__'):
+            images = [images]
+            input_was_single_img = True
+
+        if device is not None:
+            self.to(device)
+
+        permutation = np.argsort([-len(i) for i in images])
+        inverse_permutation = np.argsort(permutation)
+        images = [images[idx] for idx in permutation]
+
+        if has_tqdm:
+            range_iter = trange(
+                0,
+                len(images),
+                batch_size,
+                desc="Encoding",
+                disable=not show_progress_bar,
+            )
+        else:
+            range_iter = range(0, len(images), batch_size)
+
+        all_embeddings = []
+        for i in range_iter:
+            processed_inputs = self.preprocess(
+                [Image.open(image) for image in images[i : i + batch_size]]
+            )
+            embeddings = self.get_image_features(processed_inputs)
+            if normalize_embeddings:
+                embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
+            if convert_to_numpy:
+                embeddings = embeddings.cpu()
+            all_embeddings.extend(embeddings)
+
+        all_embeddings = [all_embeddings[idx] for idx in inverse_permutation]
+
+        if convert_to_tensor:
+            all_embeddings = torch.stack(all_embeddings)
+        elif convert_to_numpy:
+            all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings])
+
+        if input_was_single_img:
+            all_embeddings = all_embeddings[0]
+
+        self.train(is_training)
+        return all_embeddings
 
     def forward(
         self,