gmastrapas committed cd1adcb (parent: 4ed2f34)

style: apply ruff & isort

modeling_clip.py CHANGED (+62 -46)
@@ -5,20 +5,28 @@
 # and adjusted for Jina CLIP
 
 from functools import partial
-from typing import Optional, Tuple, Union
+from typing import List, Optional, Tuple, Union
 
 import numpy as np
 import torch
 import torch.nn.functional as f
 import torch.utils.checkpoint
 from torch import nn
-from transformers import
+from transformers import (
+    AutoImageProcessor,
+    AutoTokenizer,
+    BatchEncoding,
+    BatchFeature,
+    PreTrainedModel,
+    logging,
+)
 from transformers.models.clip.modeling_clip import (
     CLIPOutput,
     CLIPTextModelOutput,
     CLIPVisionModelOutput,
     clip_loss,
 )
+
 try:
     from tqdm.autonotebook import trange
 
@@ -226,6 +234,20 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
         self.preprocess = None
         self.post_init()
 
+    def get_tokenizer(self):
+        if not self.tokenizer:
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                self.config._name_or_path, trust_remote_code=True
+            )
+        return self.tokenizer
+
+    def get_preprocess(self):
+        if not self.preprocess:
+            self.preprocess = AutoImageProcessor.from_pretrained(
+                self.config._name_or_path, trust_remote_code=True
+            )
+        return self.preprocess
+
     def get_text_features(
         self,
         input_ids: Union[None, torch.Tensor, BatchEncoding] = None,
@@ -248,11 +270,6 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
         )
         return self.visual_projection(self.vision_model(x=x))
 
-    def get_tokenizer(self):
-        if not self.tokenizer:
-            self.tokenizer = AutoTokenizer.from_pretrained(self.config._name_or_path, trust_remote_code=True)
-        return self.tokenizer
-
     @torch.inference_mode()
     def encode_text(
         self,
@@ -266,38 +283,41 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
         **tokenizer_kwargs,
     ) -> Union[List[torch.Tensor], np.ndarray, torch.Tensor]:
         """
-        [previous docstring body (25 lines) not captured]
+        Computes sentence embeddings
+        Args:
+            sentences(`str` or `List[str]`):
+                Sentence or sentences to be encoded
+            batch_size(`int`, *optional*, defaults to 32):
+                Batch size for the computation
+            show_progress_bar(`bool`, *optional*, defaults to None):
+                Show a progress bar when encoding sentences.
+                If set to None, progress bar is only shown when
+                `logger.level == logging.INFO` or `logger.level == logging.DEBUG`.
+            convert_to_numpy(`bool`, *optional*, defaults to True):
+                If true, the output is a list of numpy vectors.
+                Else, it is a list of pytorch tensors.
+            convert_to_tensor(`bool`, *optional*, defaults to False):
+                If true, you get one large tensor as return.
+                Overwrites any setting from convert_to_numpy
+            device(`torch.device`, *optional*, defaults to None):
+                Which torch.device to use for the computation
+            normalize_embeddings(`bool`, *optional*, defaults to False):
+                If set to true, returned vectors will have length 1. In that case,
+                the faster dot-product (util.dot_score) instead of cosine similarity
+                can be used.
+            tokenizer_kwargs(`Dict[str, Any]`, *optional*, defaults to {}):
+                Keyword arguments for the tokenizer
+        Returns:
+            By default, a list of tensors is returned.
+            If convert_to_tensor, a stacked tensor is returned.
+            If convert_to_numpy, a numpy matrix is returned.
         """
         is_training = self.training
         self.eval()
         all_embeddings = []
 
         self.tokenizer = self.get_tokenizer()
-
+
         if show_progress_bar is None:
             show_progress_bar = (
                 logger.getEffectiveLevel() == logging.INFO
@@ -328,7 +348,7 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
                 0,
                 len(sentences),
                 batch_size,
-                desc=
+                desc='Encoding',
                 disable=not show_progress_bar,
             )
         else:
@@ -361,13 +381,6 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
         self.train(is_training)
         return all_embeddings
 
-
-    def get_preprocess(self):
-        if not self.preprocess:
-            self.preprocess = AutoImageProcessor.from_pretrained(self.config._name_or_path, trust_remote_code=True)
-        return self.preprocess
-
-
     @torch.inference_mode()
     def encode_image(
         self,
@@ -389,7 +402,8 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
                 Batch size for the computation
             show_progress_bar(`bool`, *optional*, defaults to None):
                 Show a progress bar when encoding images.
-                If set to None, progress bar is only shown when
+                If set to None, progress bar is only shown when
+                `logger.level == logging.INFO` or `logger.level == logging.DEBUG`.
             convert_to_numpy(`bool`, *optional*, defaults to True):
                 If true, the output is a list of numpy vectors.
                 Else, it is a list of pytorch tensors.
@@ -399,14 +413,16 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
             device(`torch.device`, *optional*, defaults to None):
                 Which torch.device to use for the computation
            normalize_embeddings(`bool`, *optional*, defaults to False):
-                If set to true, returned vectors will have length 1. In that case,
+                If set to true, returned vectors will have length 1. In that case,
+                the faster dot-product (util.dot_score) instead of cosine similarity
+                can be used.
         Returns:
             By default, a list of tensors is returned.
             If convert_to_tensor, a stacked tensor is returned.
             If convert_to_numpy, a numpy matrix is returned.
         """
         from PIL import Image
-
+
         is_training = self.training
         self.eval()
 
@@ -439,13 +455,13 @@ class JinaCLIPModel(JinaCLIPPreTrainedModel):
                 0,
                 len(images),
                 batch_size,
-                desc=
+                desc='Encoding',
                 disable=not show_progress_bar,
            )
         else:
             range_iter = range(0, len(images), batch_size)
 
-        for
+        for _ in range_iter:
             processed_inputs = self.preprocess([Image.open(image) for image in images])
             embeddings = self.get_image_features(processed_inputs)
             if normalize_embeddings: