Safetensors
custom_code
kyusonglee commited on
Commit
d542e6b
·
verified ·
1 Parent(s): e4f984c

Upload 6 files

Browse files
configuration_omchat.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from transformers import LlamaConfig, PretrainedConfig
3
+ from transformers.utils import logging
4
+ from transformers import Qwen2Config, Qwen2Model, Qwen2ForCausalLM, AutoConfig, AutoModelForCausalLM
5
+
6
+
7
+ logger = logging.get_logger(__name__)
8
+
9
class InternVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
    instantiate a vision encoder according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Every argument below is stored unchanged as an attribute of the same name; any extra keyword arguments are
    forwarded to [`PretrainedConfig`].

    Args:
        num_channels (`int`, *optional*, defaults to 3):
            Number of color channels in the input images (e.g., 3 for RGB).
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        image_size (`int`, *optional*, defaults to 448):
            The size (resolution) of each image.
        qkv_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the queries and values in the self-attention layers.
        hidden_size (`int`, *optional*, defaults to 3200):
            Dimensionality of the encoder layers and the pooler layer.
        num_attention_heads (`int`, *optional*, defaults to 25):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (`int`, *optional*, defaults to 12800):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        qk_normalization (`bool`, *optional*, defaults to `True`):
            Whether to normalize the queries and keys in the self-attention layers.
        num_hidden_layers (`int`, *optional*, defaults to 45):
            Number of hidden layers in the Transformer encoder.
        use_flash_attn (`bool`, *optional*, defaults to `True`):
            Whether to use flash attention mechanism.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        drop_path_rate (`float`, *optional*, defaults to 0.0):
            Dropout rate for stochastic depth.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 1e-10):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 0.1):
            A factor for layer scale.
    """

    model_type = 'intern_vit_6b'

    def __init__(
        self,
        num_channels=3,
        patch_size=14,
        image_size=448,
        qkv_bias=False,
        hidden_size=3200,
        num_attention_heads=25,
        intermediate_size=12800,
        qk_normalization=True,
        num_hidden_layers=45,
        use_flash_attn=True,
        hidden_act='gelu',
        layer_norm_eps=1e-6,
        dropout=0.0,
        drop_path_rate=0.0,
        attention_dropout=0.0,
        initializer_range=1e-10,
        initializer_factor=0.1,
        **kwargs,
    ):
        # Generic options (anything not named above) are handled by the base class.
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.dropout = dropout
        self.drop_path_rate = drop_path_rate
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.qkv_bias = qkv_bias
        self.qk_normalization = qk_normalization
        self.use_flash_attn = use_flash_attn
97
+
98
+
99
class OmChatConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`OmChatForConditionalGeneration`]. It bundles
    a vision-backbone configuration, a text-backbone configuration and the options that connect the two.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`Union[InternVisionConfig, dict]`, *optional*):
            The config object or dictionary of the vision backbone. A `dict` is converted with
            `InternVisionConfig(**vision_config)`; any other value (including `None`) is stored unchanged.
        text_config (`Union[Qwen2Config, dict]`, *optional*):
            The config object or dictionary of the text backbone. A `dict` is converted with
            `Qwen2Config(**text_config)`; any other value (including `None`) is stored unchanged.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        image_token_index (`int`, *optional*, defaults to 32000):
            The image token index to encode the image prompt.
        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The activation function used by the multimodal projector.
        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
            The feature selection strategy used to select the vision feature from the vision backbone.
            Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features.
            If `"full"`, the full vision features are used.
        vision_feature_layer (`int`, *optional*, defaults to -1):
            The index of the layer to select the vision feature.
        image_grid_pinpoints (`List`, *optional*, defaults to `[[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]`):
            A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list
            of the form `(height, width)`.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.

    Raises:
        ValueError: If `vision_feature_select_strategy` is neither `"default"` nor `"full"`.

    Example:

    ```python
    >>> from transformers import Qwen2Config

    >>> # Initializing the vision and text backbone configs
    >>> vision_config = InternVisionConfig()
    >>> text_config = Qwen2Config()

    >>> # Initializing an OmChat configuration from the two sub-configs
    >>> configuration = OmChatConfig(vision_config, text_config)
    ```"""

    model_type = "omchat"
    is_composition = False

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        ignore_index=-100,
        image_token_index=32000,
        projector_hidden_act="gelu",
        vision_feature_select_strategy="default",
        vision_feature_layer=-1,
        image_grid_pinpoints=None,
        tie_word_embeddings=False,
        **kwargs,
    ):
        self.ignore_index = ignore_index
        self.image_token_index = image_token_index
        self.projector_hidden_act = projector_hidden_act

        if vision_feature_select_strategy not in ["default", "full"]:
            # The trailing space keeps the two adjacent literals from running together.
            raise ValueError(
                "vision_feature_select_strategy should be one of 'default', 'full'. "
                f"Got: {vision_feature_select_strategy}"
            )

        self.vision_feature_select_strategy = vision_feature_select_strategy
        self.vision_feature_layer = vision_feature_layer

        # A fresh list is built on every call so instances never share the default.
        image_grid_pinpoints = (
            image_grid_pinpoints
            if image_grid_pinpoints is not None
            else [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]
        )
        self.image_grid_pinpoints = image_grid_pinpoints

        # Sub-configs given as plain dicts (e.g. loaded from JSON) are promoted to config objects.
        if isinstance(vision_config, dict):
            vision_config = InternVisionConfig(**vision_config)
        self.vision_config = vision_config

        if isinstance(text_config, dict):
            text_config = Qwen2Config(**text_config)
        self.text_config = text_config

        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 151643,
4
+ "eos_token_id": 151645,
5
+ "transformers_version": "4.41.2"
6
+ }
image_processing_omchat.py ADDED
@@ -0,0 +1,733 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import math
3
+ from typing import Dict, Iterable, List, Optional, Tuple, Union
4
+
5
+ import numpy as np
6
+
7
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict, select_best_resolution
8
+ from transformers.image_transforms import (
9
+ PaddingMode,
10
+ convert_to_rgb,
11
+ get_resize_output_image_size,
12
+ pad,
13
+ resize,
14
+ to_channel_dimension_format,
15
+ )
16
+ from transformers.image_utils import (
17
+ OPENAI_CLIP_MEAN,
18
+ OPENAI_CLIP_STD,
19
+ ChannelDimension,
20
+ ImageInput,
21
+ PILImageResampling,
22
+ get_image_size,
23
+ infer_channel_dimension_format,
24
+ is_scaled_image,
25
+ is_valid_image,
26
+ make_list_of_images,
27
+ to_numpy_array,
28
+ valid_images,
29
+ validate_preprocess_arguments,
30
+ )
31
+ from transformers.utils import TensorType, is_vision_available, logging
32
+
33
+
34
+ logger = logging.get_logger(__name__)
35
+
36
+
37
+ if is_vision_available():
38
+ from PIL import Image
39
+
40
+
41
def make_batched_images(images) -> List[ImageInput]:
    """
    Accepts images in list or nested list format, and makes a flat list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            A single image, a list of images, or a list of lists of images.

    Returns:
        list: A flat list of images. A nested list is flattened, a flat sequence is returned as is, and a
        single image is wrapped in a one-element list.

    Raises:
        ValueError: If `images` is empty or is not one of the supported layouts.
    """
    if isinstance(images, (list, tuple)):
        # Guard the `[0]` look-ups so empty inputs reach the ValueError below
        # instead of failing with an IndexError.
        if len(images) > 0:
            first = images[0]
            if isinstance(first, (list, tuple)):
                if len(first) > 0 and is_valid_image(first[0]):
                    return [img for img_list in images for img in img_list]
            elif is_valid_image(first):
                return images
    elif is_valid_image(images):
        return [images]

    raise ValueError(f"Could not make batched images from {images}")
62
+
63
+
64
def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]:
    """
    Cut an image into tiles of at most `patch_size` x `patch_size` pixels.

    Tiles are produced row by row (top to bottom), and left to right within a row. Tiles on the bottom or
    right border are smaller when the image size is not a multiple of `patch_size`.

    Args:
        image (`np.array`):
            The input image.
        patch_size (`int`):
            The size of each patch.
        input_data_format (`ChannelDimension` or `str`):
            The channel dimension format of the input image.

    Returns:
        list: A list of np.array representing the patches.
    """
    img_height, img_width = get_image_size(image, channel_dim=input_data_format)
    # Anything that is not channels-last is sliced with a leading channel axis.
    channels_last = input_data_format == ChannelDimension.LAST

    tiles = []
    for top in range(0, img_height, patch_size):
        rows = slice(top, top + patch_size)
        for left in range(0, img_width, patch_size):
            cols = slice(left, left + patch_size)
            tiles.append(image[rows, cols] if channels_last else image[:, rows, cols])
    return tiles
90
+
91
+
92
def expand_to_square(image: np.array, background_color, input_data_format) -> np.array:
    """
    Expands an image to a square by adding a background color.

    The image is centered along its shorter side on a square canvas whose side equals the longer side.
    Both channels-last and channels-first layouts are supported.

    Args:
        image (`np.array`):
            The input image, with a channel axis.
        background_color:
            Fill value for the added border: a scalar, or one value per channel.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format of the input image. Inferred from the image when `None`.

    Returns:
        np.array: The input itself when it is already square, otherwise a new square array.
    """
    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(image)

    height, width = get_image_size(image, channel_dim=input_data_format)
    if width == height:
        return image

    side = max(height, width)
    # Exactly one of these offsets is non-zero: the image already spans the longer side.
    top = (side - height) // 2
    left = (side - width) // 2

    if input_data_format == ChannelDimension.FIRST:
        color = background_color
        if np.ndim(color) > 0:
            # Per-channel colors must broadcast along the leading channel axis.
            color = np.asarray(color).reshape(-1, 1, 1)
        result = np.ones((image.shape[0], side, side), dtype=image.dtype) * color
        result[:, top : top + height, left : left + width] = image
    else:
        result = np.ones((side, side, image.shape[2]), dtype=image.dtype) * background_color
        result[top : top + height, left : left + width] = image
    return result
108
+
109
+
110
def _get_patch_output_size(image, target_resolution, input_data_format):
    """
    Compute the largest size that fits inside `target_resolution` while keeping the image's aspect ratio.

    Args:
        image: The input image.
        target_resolution: The `(height, width)` bounding box to fit into.
        input_data_format: The channel dimension format of the input image.

    Returns:
        tuple: `(new_height, new_width)`, each no larger than the corresponding target dimension.
    """
    src_height, src_width = get_image_size(image, channel_dim=input_data_format)
    dst_height, dst_width = target_resolution

    width_ratio = dst_width / src_width
    height_ratio = dst_height / src_height

    if width_ratio >= height_ratio:
        # Height is the limiting side: fill it and scale the width by the same factor.
        return dst_height, min(math.ceil(src_width * height_ratio), dst_width)

    # Width is the limiting side: fill it and scale the height by the same factor.
    return min(math.ceil(src_height * width_ratio), dst_height), dst_width
125
+
126
+
127
+ class OmChatImageProcessor(BaseImageProcessor):
128
+ r"""
129
+ Constructs a LLaVa-NeXT image processor. Based on [`CLIPImageProcessor`] with incorporation of additional techniques
130
+ for processing high resolution images as explained in the [LLaVa paper](https://arxiv.org/abs/2310.03744).
131
+
132
+ Args:
133
+ do_resize (`bool`, *optional*, defaults to `True`):
134
+ Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
135
+ `do_resize` in the `preprocess` method.
136
+ size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 448}`):
137
+ Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
138
+ the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
139
+ method.
140
+ image_grid_pinpoints (`List` *optional*, defaults to `[[448, 896], [896, 448], [896, 896], [1344, 448], [448, 1344], [1344, 1344]]`):
141
+ A list of possible resolutions to use for processing high resolution images. The best resolution is selected
142
+ based on the original size of the image. Can be overridden by `image_grid_pinpoints` in the `preprocess`
143
+ method.
144
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
145
+ Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
146
+ do_center_crop (`bool`, *optional*, defaults to `True`):
147
+ Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
148
+ `preprocess` method.
149
+ crop_size (`Dict[str, int]` *optional*, defaults to `{"height": 448, "width": 448}`):
150
+ Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
151
+ method.
152
+ do_rescale (`bool`, *optional*, defaults to `True`):
153
+ Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
154
+ the `preprocess` method.
155
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
156
+ Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
157
+ method.
158
+ do_normalize (`bool`, *optional*, defaults to `True`):
159
+ Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
160
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.485, 0.456, 0.406]`):
161
+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
162
+ channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
163
+ image_std (`float` or `List[float]`, *optional*, defaults to `[0.229, 0.224, 0.225]`):
164
+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
165
+ number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
166
+ Can be overridden by the `image_std` parameter in the `preprocess` method.
167
+ do_pad (`bool`, *optional*, defaults to `True`):
168
+ Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
169
+ number of patches in the batch. Padding will be applied to the bottom and right with zeros.
170
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
171
+ Whether to convert the image to RGB.
172
+ """
173
+
174
+ model_input_names = ["pixel_values"]
175
+
176
def __init__(
    self,
    do_resize: bool = True,
    size: Optional[Dict[str, int]] = None,
    image_grid_pinpoints: Optional[List] = None,
    resample: PILImageResampling = PILImageResampling.BICUBIC,
    do_center_crop: bool = True,
    crop_size: Optional[Dict[str, int]] = None,
    do_rescale: bool = True,
    rescale_factor: Union[int, float] = 1 / 255,
    do_normalize: bool = True,
    image_mean: Optional[Union[float, List[float]]] = [0.485, 0.456, 0.406],
    image_std: Optional[Union[float, List[float]]] = [0.229, 0.224, 0.225],
    do_convert_rgb: bool = True,
    **kwargs,
) -> None:
    """
    Store the preprocessing options on the processor.

    Args:
        do_resize: Whether to resize images.
        size: Resize target; defaults to `{"shortest_edge": 448}`.
        image_grid_pinpoints: Candidate `(height, width)` resolutions; defaults to
            `[[448, 896], [896, 448], [896, 896], [1344, 448], [448, 1344], [1344, 1344]]`.
        resample: Resampling filter used when resizing.
        do_center_crop: Whether to center crop images.
        crop_size: Center-crop size; defaults to `{"height": 448, "width": 448}`.
        do_rescale: Whether to rescale pixel values by `rescale_factor`.
        rescale_factor: Scale factor applied when rescaling.
        do_normalize: Whether to normalize images.
        image_mean: Normalization mean; `None` falls back to `OPENAI_CLIP_MEAN`.
        image_std: Normalization standard deviation; `None` falls back to `OPENAI_CLIP_STD`.
        do_convert_rgb: Whether to convert images to RGB.
        **kwargs: Forwarded to `BaseImageProcessor`.
    """
    super().__init__(**kwargs)

    if size is None:
        size = {"shortest_edge": 448}
    size = get_size_dict(size, default_to_square=False)

    if image_grid_pinpoints is None:
        image_grid_pinpoints = [[448, 896], [896, 448], [896, 896], [1344, 448], [448, 1344], [1344, 1344]]

    if crop_size is None:
        crop_size = {"height": 448, "width": 448}
    crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")

    if image_mean is None:
        image_mean = OPENAI_CLIP_MEAN
    if image_std is None:
        image_std = OPENAI_CLIP_STD

    self.do_resize = do_resize
    self.size = size
    self.image_grid_pinpoints = image_grid_pinpoints
    self.resample = resample
    self.do_center_crop = do_center_crop
    self.crop_size = crop_size
    self.do_rescale = do_rescale
    self.rescale_factor = rescale_factor
    self.do_normalize = do_normalize
    # Copy list values: the signature defaults (and the library constants) are single
    # shared objects, so storing them directly would let one instance's in-place edit
    # leak into every other instance.
    self.image_mean = list(image_mean) if isinstance(image_mean, list) else image_mean
    self.image_std = list(image_std) if isinstance(image_std, list) else image_std
    self.do_convert_rgb = do_convert_rgb
215
+
216
+ # Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize with CLIP->LLaVa
217
def resize(
    self,
    image: np.ndarray,
    size: Dict[str, int],
    resample: PILImageResampling = PILImageResampling.BICUBIC,
    data_format: Optional[Union[str, ChannelDimension]] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
    **kwargs,
) -> np.ndarray:
    """
    Resize an image according to a size dictionary.

    With `{"shortest_edge": n}` the shortest edge is resized to `n` and the longest edge follows the input
    aspect ratio. With `{"height": h, "width": w}` the image is resized to exactly `(h, w)`.

    Args:
        image (`np.ndarray`):
            Image to resize.
        size (`Dict[str, int]`):
            Size of the output image, in one of the two forms above.
        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
            Resampling filter to use when resizing the image.
        data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format of the image. If not provided, it will be the same as the input image.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format of the input image. If not provided, it will be inferred.

    Raises:
        ValueError: If `size` has neither `"shortest_edge"` nor both `"height"` and `"width"`.
    """
    if "shortest_edge" in size:
        requested, force_square = size["shortest_edge"], False
    elif "height" in size and "width" in size:
        requested, force_square = (size["height"], size["width"]), True
    else:
        raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")

    target_hw = get_resize_output_image_size(
        image,
        size=requested,
        default_to_square=force_square,
        input_data_format=input_data_format,
    )

    # `resize` here is the module-level transform, not this method.
    return resize(
        image,
        size=target_hw,
        resample=resample,
        data_format=data_format,
        input_data_format=input_data_format,
        **kwargs,
    )
266
+
267
def pad(
    self,
    image: np.ndarray,
    padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]],
    mode: PaddingMode = PaddingMode.CONSTANT,
    constant_values: Union[float, Iterable[float]] = 0.0,
    data_format: Optional[Union[str, ChannelDimension]] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> np.ndarray:
    """
    Pad `image` either along its (`height`, `width`) axes or along every axis of a 4-axis array.

    An `int` padding, or a padding whose length is not 4, is delegated to the generic `pad` transform.
    A padding of length 4 is applied directly with `np.pad`, one `(before, after)` entry per array axis
    (presumably `(num_patches, ...)` stacks; the callers are not visible here).

    Args:
        image (`np.ndarray`):
            The image to pad.
        padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`):
            Padding to apply. For height/width padding, one of:
                - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis.
                - `((before, after),)` yields same before and after pad for height and width.
                - `(pad,)` or int is a shortcut for before = after = pad width for all axes.
        mode (`PaddingMode`):
            The padding mode to use. Can be one of:
                - `"constant"`: pads with a constant value.
                - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the
                  vector along each axis.
                - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis.
                - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array.
        constant_values (`float` or `Iterable[float]`, *optional*):
            The value to use for the padding if `mode` is `"constant"`.
        data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format for the output image. If unset, the output is left as produced.
        input_data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format for the input image. If unset, it is inferred from the input image.

    Returns:
        `np.ndarray`: The padded image.

    Raises:
        ValueError: If `mode` is not a supported padding mode (4-entry padding only).
    """
    # Height/width padding is handled by the general-purpose transform.
    if isinstance(padding, int) or len(padding) != 4:
        return pad(image, padding, mode, constant_values, data_format, input_data_format)

    if input_data_format is None:
        input_data_format = infer_channel_dimension_format(image)

    if mode == PaddingMode.CONSTANT:
        padded = np.pad(image, padding, mode="constant", constant_values=constant_values)
    else:
        # Compare with `==` rather than a dict lookup so plain-string modes keep working.
        numpy_modes = (
            (PaddingMode.REFLECT, "reflect"),
            (PaddingMode.REPLICATE, "edge"),
            (PaddingMode.SYMMETRIC, "symmetric"),
        )
        for candidate, numpy_mode in numpy_modes:
            if mode == candidate:
                padded = np.pad(image, padding, mode=numpy_mode)
                break
        else:
            raise ValueError(f"Invalid padding mode: {mode}")

    if data_format is None:
        return padded
    return to_channel_dimension_format(padded, data_format, input_data_format)
334
+
335
def _preprocess(
    self,
    images: ImageInput,
    do_resize: Optional[bool] = None,
    size: Optional[Dict[str, int]] = None,
    resample: Optional[PILImageResampling] = None,
    do_center_crop: Optional[bool] = None,
    crop_size: Optional[Dict[str, int]] = None,
    do_rescale: Optional[bool] = None,
    rescale_factor: Optional[float] = None,
    do_normalize: Optional[bool] = None,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> List[np.ndarray]:
    """
    Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.

    The enabled steps run in a fixed order: resize, center crop, rescale, normalize, then conversion to
    `data_format`. A step whose flag is `None` or `False` is skipped; this method does not fall back to the
    processor's attributes (NOTE(review): presumably the caller resolves those defaults before calling).

    Args:
        images (`ImageInput`):
            Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
            passing in images with pixel values between 0 and 1, set `do_rescale=False`.
        do_resize (`bool`, *optional*):
            Whether to resize the image.
        size (`Dict[str, int]`, *optional*):
            Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
            the longest edge resized to keep the input aspect ratio.
        resample (`int`, *optional*):
            Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
            has an effect if `do_resize` is set to `True`.
        do_center_crop (`bool`, *optional*):
            Whether to center crop the image.
        crop_size (`Dict[str, int]`, *optional*):
            Size of the center crop. Only has an effect if `do_center_crop` is set to `True`.
        do_rescale (`bool`, *optional*):
            Whether to rescale the image.
        rescale_factor (`float`, *optional*):
            Rescale factor to rescale the image by if `do_rescale` is set to `True`.
        do_normalize (`bool`, *optional*):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*):
            Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
        image_std (`float` or `List[float]`, *optional*):
            Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
            `True`.
        data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
            The channel dimension format for the output image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format for the input image. If unset, the channel dimension format is inferred
            from the input image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

    Returns:
        `List[np.ndarray]`: The processed images, one per input image.
    """
    images = make_list_of_images(images)

    if do_resize:
        images = [
            self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
            for image in images
        ]

    if do_center_crop:
        images = [
            self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images
        ]

    if do_rescale:
        images = [
            self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
            for image in images
        ]

    if do_normalize:
        images = [
            self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
            for image in images
        ]

    # Convert every image to the requested output layout last, after all pixel operations.
    images = [
        to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
    ]

    return images
423
+
424
def _resize_for_patching(
    self, image: np.array, target_resolution: tuple, resample, input_data_format: ChannelDimension
) -> np.array:
    """
    Resize an image so it fits inside `target_resolution` while keeping its aspect ratio.

    No padding is applied here; the result may be smaller than the target along one dimension.

    Args:
        image (np.array):
            The input image.
        target_resolution (tuple):
            The target resolution (height, width) of the image.
        resample (`PILImageResampling`):
            Resampling filter to use if resizing the image.
        input_data_format (`ChannelDimension` or `str`):
            The channel dimension format of the input image.

    Returns:
        np.array: The resized image.
    """
    fit_height, fit_width = _get_patch_output_size(image, target_resolution, input_data_format)
    return resize(image, (fit_height, fit_width), resample=resample, input_data_format=input_data_format)
449
+
450
def _pad_for_patching(
    self, image: np.array, target_resolution: tuple, input_data_format: ChannelDimension
) -> np.array:
    """
    Pad an image to a target resolution, keeping its content centered.

    Args:
        image (np.array):
            The image to pad.
        target_resolution (tuple):
            The target resolution (height, width) to pad up to.
        input_data_format (`ChannelDimension` or `str`):
            The channel dimension format of the input image.

    Returns:
        np.array: The padded image.
    """
    target_height, target_width = target_resolution
    new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format)

    # Split the missing pixels evenly; when the difference is odd the extra pixel goes
    # to the bottom/right so the padding always adds up to the full difference.
    pad_top, extra_rows = divmod(target_height - new_height, 2)
    pad_left, extra_cols = divmod(target_width - new_width, 2)

    padded_image = self.pad(
        image,
        padding=((pad_top, pad_top + extra_rows), (pad_left, pad_left + extra_cols)),
    )

    return padded_image
465
+
466
def get_image_patches(
    self,
    image: np.array,
    grid_pinpoints,
    size: tuple,
    patch_size: int,
    resample: PILImageResampling,
    data_format: ChannelDimension,
    input_data_format: ChannelDimension,
) -> List[np.array]:
    """
    Turn one image into a resized overview image followed by the patches of a higher-resolution version.

    Args:
        image (np.array):
            The input image to be processed.
        grid_pinpoints (List):
            A list of possible resolutions; the best match for the image is selected from it.
        size (`tuple`):
            Size to resize the original image to.
        patch_size (`int`):
            Size of the patches to divide the image into.
        resample (`PILImageResampling`):
            Resampling filter to use if resizing the image.
        data_format (`ChannelDimension` or `str`):
            The channel dimension format for the output image.
        input_data_format (`ChannelDimension` or `str`):
            The channel dimension format of the input image.

    Returns:
        List[np.array]: The resized original image first, then the image patches.

    Raises:
        TypeError: If `grid_pinpoints` is not a list.
    """
    if not isinstance(grid_pinpoints, list):
        raise TypeError("grid_pinpoints must be a list of possible resolutions.")

    original_size = get_image_size(image, channel_dim=input_data_format)
    target_resolution = select_best_resolution(original_size, grid_pinpoints)

    # Resize, pad up to the selected resolution, then cut the result into patches.
    fitted = self._resize_for_patching(
        image, target_resolution, resample=resample, input_data_format=input_data_format
    )
    canvas = self._pad_for_patching(fitted, target_resolution, input_data_format=input_data_format)
    raw_patches = divide_to_patches(canvas, patch_size=patch_size, input_data_format=input_data_format)

    # Convert every patch to the requested output data format.
    converted = []
    for raw_patch in raw_patches:
        converted.append(
            to_channel_dimension_format(raw_patch, channel_dim=data_format, input_channel_dim=input_data_format)
        )

    overview = resize(
        image,
        size=size,
        resample=resample,
        data_format=data_format,
        input_data_format=input_data_format,
    )

    return [overview] + converted
529
+
530
def _pad_for_batching(
    self,
    pixel_values: List[np.ndarray],
    data_format: Optional[Union[str, ChannelDimension]] = None,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
):
    """
    Zero-pad every entry along its first (`num_patches`) axis so all entries have the same patch count.

    Args:
        pixel_values (`List[np.ndarray]`):
            One array per image. The leading axis is the number of patches; three more axes follow it.
        data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            If unset, will use same as the input image.
        input_data_format (`str` or `ChannelDimension`, *optional*):
            The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            If unset, will use the inferred format of the input image.

    Returns:
        List[`np.ndarray`]: The padded arrays, in the same order as the input.
    """
    longest = max(map(len, pixel_values))

    padded = []
    for patches in pixel_values:
        # Only the trailing side of the patch axis grows; the other three axes are left untouched.
        missing = longest - patches.shape[0]
        padded.append(
            self.pad(
                patches,
                padding=((0, missing), (0, 0), (0, 0), (0, 0)),
                data_format=data_format,
                input_data_format=input_data_format,
            )
        )

    return padded
568
+
569
def preprocess(
    self,
    images: ImageInput,
    do_resize: bool = None,
    size: Dict[str, int] = None,
    image_grid_pinpoints: List = None,
    resample: PILImageResampling = None,
    do_center_crop: bool = None,
    crop_size: int = None,
    do_rescale: bool = None,
    rescale_factor: float = None,
    do_normalize: bool = None,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    do_convert_rgb: bool = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
):
    """
    Split every image into an overview image plus patches, preprocess them and batch the result.

    Args:
        images (`ImageInput`):
            Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
            passing in images with pixel values between 0 and 1, set `do_rescale=False`.
        do_resize (`bool`, *optional*, defaults to `self.do_resize`):
            Whether to resize the image.
        size (`Dict[str, int]`, *optional*, defaults to `self.size`):
            Size of the image after resizing. The `"shortest_edge"` entry is read here and also gives the side
            of the square overview image requested from `get_image_patches`.
        image_grid_pinpoints (`List` *optional*, defaults to `self.image_grid_pinpoints`):
            A list of possible resolutions to use for processing high resolution images. The best resolution is
            selected based on the original size of the image.
        resample (`int`, *optional*, defaults to `self.resample`):
            Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
            has an effect if `do_resize` is set to `True`.
        do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
            Whether to center crop the image.
        crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
            Size of the center crop. Its `"height"` entry is also used as the patch size.
        do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
            Whether to rescale the image.
        rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
            Rescale factor to rescale the image by if `do_rescale` is set to `True`.
        do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
            Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
        image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
            Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
            `True`.
        do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
            Whether to convert the image to RGB.
        return_tensors (`str` or `TensorType`, *optional*):
            The type of tensors to return. Can be one of:
            - Unset: Return a list of `np.ndarray`.
            - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
            - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
            - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
            - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
        data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
            The channel dimension format for the output image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            - Unset: Use the channel dimension format of the input image.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format for the input image. If unset, the channel dimension format is inferred
            from the input image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

    Returns:
        `BatchFeature` with two entries:
            - `"pixel_values"`: the per-image patch stacks, zero-padded to the same number of patches.
            - `"num_patches"`: the number of real (unpadded) patches produced for each image.

    Raises:
        ValueError: If `images` contains an unsupported image type.
    """
    do_resize = do_resize if do_resize is not None else self.do_resize
    size = size if size is not None else self.size
    size = get_size_dict(size, param_name="size", default_to_square=False)
    image_grid_pinpoints = image_grid_pinpoints if image_grid_pinpoints is not None else self.image_grid_pinpoints
    resample = resample if resample is not None else self.resample
    do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
    crop_size = crop_size if crop_size is not None else self.crop_size
    crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True)
    do_rescale = do_rescale if do_rescale is not None else self.do_rescale
    rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
    do_normalize = do_normalize if do_normalize is not None else self.do_normalize
    image_mean = image_mean if image_mean is not None else self.image_mean
    image_std = image_std if image_std is not None else self.image_std
    do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

    images = make_batched_images(images)

    if not valid_images(images):
        raise ValueError(
            "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
            "torch.Tensor, tf.Tensor or jax.ndarray."
        )

    validate_preprocess_arguments(
        do_rescale=do_rescale,
        rescale_factor=rescale_factor,
        do_normalize=do_normalize,
        image_mean=image_mean,
        image_std=image_std,
        do_center_crop=do_center_crop,
        crop_size=crop_size,
        do_resize=do_resize,
        size=size,
        resample=resample,
    )

    if do_convert_rgb:
        images = [convert_to_rgb(image) for image in images]

    # All transformations expect numpy arrays.
    images = [to_numpy_array(image) for image in images]

    if is_scaled_image(images[0]) and do_rescale:
        logger.warning_once(
            "It looks like you are trying to rescale already rescaled images. If the input"
            " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
        )

    if input_data_format is None:
        # We assume that all images have the same channel dimension format.
        input_data_format = infer_channel_dimension_format(images[0])

    new_images = []
    num_patches = []
    for image in images:
        # Convert the image into a list of patches. The input data format is intentionally kept here;
        # the conversion to `data_format` happens in `_preprocess` below.
        image_patches = self.get_image_patches(
            image,
            image_grid_pinpoints,
            size=(size["shortest_edge"], size["shortest_edge"]),
            patch_size=crop_size["height"],
            resample=resample,
            data_format=input_data_format,
            input_data_format=input_data_format,
        )

        # Preprocess the overview image and the patches together.
        pixel_values = self._preprocess(
            image_patches,
            do_resize=do_resize,
            size=size,
            resample=resample,
            do_center_crop=do_center_crop,
            crop_size=crop_size,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            data_format=data_format,
            input_data_format=input_data_format,
        )
        # Record the real patch count before batching pads the shorter stacks with zeros.
        num_patches.append(len(pixel_values))
        new_images.append(np.array(pixel_values))

    processed_images = self._pad_for_batching(new_images)

    return BatchFeature(
        data={"pixel_values": processed_images, "num_patches": num_patches},
        tensor_type=return_tensors,
    )
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00003-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:faaf2890af5673988fe2ea5507af18d7d7c3d9402ffb74e142f81c96af2c82b7
3
+ size 4946773672
processing_omchat.py ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Union
2
+
3
+ from transformers import PreTrainedTokenizer
4
+ from typing import List, Tuple
5
+
6
+ from transformers.feature_extraction_utils import BatchFeature
7
+ from transformers.image_utils import ImageInput
8
+ from transformers.processing_utils import ProcessorMixin
9
+ from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
10
+ from transformers.utils import TensorType
11
+ import torch
12
+
13
def tokenizer_image_token(prompt, tokenizer, image_token_index=-200, return_tensors=None):
    """
    Tokenize a prompt, replacing each image tag with the placeholder id `image_token_index`.

    Two tag styles are handled:
        - numbered tags (`<image_0>`, `<image_1>`, ...), used when the prompt contains `<image_0>`;
        - plain `<image>` tags otherwise.

    Args:
        prompt (`str`):
            Prompt text that may contain image tags.
        tokenizer:
            Callable returning an object with an `input_ids` list for a piece of text. Its `bos_token_id`
            attribute is read in the plain `<image>` branch.
        image_token_index (`int`, *optional*, defaults to -200):
            Id inserted in place of every image tag.
        return_tensors (`str`, *optional*):
            `None` to return a list of ids, or `'pt'` to return a `torch.long` tensor.

    Returns:
        `List[int]` or `torch.Tensor`: The token ids with image placeholders inserted.

    Raises:
        ValueError: If `return_tensors` is neither `None` nor `'pt'`.
    """
    # Imported here because the module's import block does not provide `re`.
    import re

    if "<image_0>" in prompt:
        prompt_chunks = re.split(r"<image_[0-9]+>", prompt)
        last_chunk = len(prompt_chunks) - 1

        input_ids = []
        for i, chunk in enumerate(prompt_chunks):
            input_ids.extend(tokenizer(chunk).input_ids)
            # One placeholder per tag, i.e. after every chunk except the last one.
            if i < last_chunk:
                input_ids.append(image_token_index)
    else:
        prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]

        def insert_separator(X, sep):
            return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]

        input_ids = []
        offset = 0
        # When the tokenizer prepends BOS to every chunk, keep it once at the front and strip it from
        # each chunk (and from the doubled separator) through `offset`.
        if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
            offset = 1
            input_ids.append(prompt_chunks[0][0])

        for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
            input_ids.extend(x[offset:])

    # Convert to tensor if required
    if return_tensors is not None:
        if return_tensors == 'pt':
            return torch.tensor(input_ids, dtype=torch.long)
        raise ValueError(f'Unsupported tensor type: {return_tensors}')

    return input_ids
47
+
48
+
49
def make_context(
    tokenizer: PreTrainedTokenizer,
    query: str,
    history: Optional[List[Tuple[str, str]]] = None,
    system: str = "",
    max_window_size: int = 6144,
    chat_format: str = "chatml",
):
    """
    Build the prompt text and token ids for one chat turn.

    Args:
        tokenizer (`PreTrainedTokenizer`):
            Tokenizer whose `encode` method turns text into token ids.
        query (`str`):
            The current user message.
        history (`List[Tuple[str, str]]`, *optional*):
            Earlier `(user, assistant)` turns, oldest first. Turns are added from the most recent one
            backwards for as long as the total stays below `max_window_size`.
        system (`str`, *optional*, defaults to `""`):
            System prompt placed at the start of the context.
        max_window_size (`int`, *optional*, defaults to 6144):
            Token budget covering the system prompt and the history turns.
        chat_format (`str`, *optional*, defaults to `"chatml"`):
            `"chatml"` wraps every message in `<|im_start|>` / `<|im_end|>` markers; `"raw"` encodes
            `query` as is.

    Returns:
        `Tuple[str, List[int]]`: The prompt as text and the matching token ids.

    Raises:
        NotImplementedError: If `chat_format` is neither `"chatml"` nor `"raw"`.
    """
    if history is None:
        history = []

    if chat_format == "chatml":
        im_start, im_end = "<|im_start|>", "<|im_end|>"
        # Hard-coded ids for the two markers above (tokenizer specific).
        im_start_tokens = [151644]
        im_end_tokens = [151645]
        nl_tokens = tokenizer.encode("\n")

        def _tokenize_str(role, content):
            role_tokens = tokenizer.encode(role) + nl_tokens
            if "<image>" in content:
                # Image tags become placeholder ids instead of being tokenized as text.
                content_tokens = tokenizer_image_token(content, tokenizer, -200)
            else:
                content_tokens = tokenizer.encode(content)
            return f"{role}\n{content}", role_tokens + content_tokens

        system_text, system_tokens_part = _tokenize_str("system", system)
        system_tokens = im_start_tokens + system_tokens_part + im_end_tokens

        raw_text = ""
        context_tokens = []

        # Walk the history from the newest turn backwards and stop at the first turn that does not fit.
        for turn_query, turn_response in reversed(history):
            query_text, query_tokens_part = _tokenize_str("user", turn_query)
            query_tokens = im_start_tokens + query_tokens_part + im_end_tokens
            response_text, response_tokens_part = _tokenize_str("assistant", turn_response)
            response_tokens = im_start_tokens + response_tokens_part + im_end_tokens

            next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens
            prev_chat = f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}"

            current_context_size = len(system_tokens) + len(next_context_tokens) + len(context_tokens)
            if current_context_size >= max_window_size:
                break
            context_tokens = next_context_tokens + context_tokens
            raw_text = prev_chat + raw_text

        context_tokens = system_tokens + context_tokens
        raw_text = f"{im_start}{system_text}{im_end}" + raw_text
        # Append the current user turn and open the assistant turn for generation.
        context_tokens += (
            nl_tokens
            + im_start_tokens
            + _tokenize_str("user", query)[1]
            + im_end_tokens
            + nl_tokens
            + im_start_tokens
            + tokenizer.encode("assistant")
            + nl_tokens
        )
        raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"

    elif chat_format == "raw":
        raw_text = query
        context_tokens = tokenizer.encode(raw_text)
    else:
        raise NotImplementedError(f"Unknown chat format {chat_format!r}")

    return raw_text, context_tokens
132
+
133
def split_tensor(A, B):
    """
    Cut a padded batch back into per-item pieces.

    Args:
        A:
            Array or tensor indexed as `A[i, :size]`; the first axis selects the item and the second
            axis is the one that gets trimmed.
        B:
            Object with a `tolist()` method giving, for each item `i`, how many leading entries of
            `A[i]` to keep.

    Returns:
        `list`: One slice `A[i, :size]` per entry of `B`, in order.
    """
    return [A[i, :size] for i, size in enumerate(B.tolist())]
142
+
143
class OmChatProcessor(ProcessorMixin):
    r"""
    Constructs a OmChat processor which wraps an image processor and a tokenizer into a single processor.

    The two components are declared through `image_processor_class = "AutoImageProcessor"` and
    `tokenizer_class = "AutoTokenizer"`. See [`~OmChatProcessor.__call__`] and [`~OmChatProcessor.decode`]
    for more information.

    Args:
        image_processor (*optional*):
            The image processor is a required input.
        tokenizer (*optional*):
            The tokenizer is a required input.
        kwargs:
            Accepted for compatibility (for example `chat_template`); they are not forwarded to the parent
            class.
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
        super().__init__(image_processor, tokenizer)

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
        system_prompt: str = "You are a helpful assistant.",
        images: ImageInput = None,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
    ) -> BatchFeature:
        """
        Build the model inputs for one prompt and, optionally, one or several images.

        The prompt is wrapped in the chat format produced by `make_context`. When images are given, every
        image contributes one `<image>` placeholder followed by one `patch:<image>` placeholder per
        additional patch reported by the image processor.

        Args:
            text (`str`):
                The prompt. `str` methods (`replace`, `split`) are called on it, so a single string is
                expected. With several images, the text is split on `<image>` and the image placeholders
                are inserted between the pieces.
            system_prompt (`str`, *optional*, defaults to `"You are a helpful assistant."`):
                The system prompt placed at the start of the context, with or without images.
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. When `None`, any `<image>` tag is removed from
                `text` and only `input_ids` is returned.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Currently unused by this method.
            truncation (`bool`, *optional*):
                Currently unused by this method.
            max_length (`int`, *optional*):
                Currently unused by this method.
            return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `TensorType.PYTORCH`):
                Forwarded to the image processor. The results are then passed to `torch.cat`, so PyTorch
                tensors are presumably required -- TODO confirm other values are unsupported.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- `torch.Tensor` holding the token ids of the single prompt.
            - **images** -- The real (unpadded) patches of all images concatenated along the first
              dimension. Only present when `images` is not `None`.
        """
        if images is None:
            # Text-only prompt: drop stray image tags and honour the caller's system prompt.
            _, context_tokens = make_context(
                self.tokenizer,
                text.replace("<image>", "").strip(),
                None,
                system_prompt,
            )
            return BatchFeature(data={"input_ids": torch.tensor([context_tokens])})

        image_inputs = self.image_processor(images, return_tensors=return_tensors)
        num_patches = image_inputs["num_patches"]
        patch_counts = num_patches.tolist()
        # Undo the batch padding so only the real patches of each image are kept.
        per_image_patches = split_tensor(image_inputs["pixel_values"], num_patches)

        if len(per_image_patches) == 1:
            n = patch_counts[0]
            query = "<image>\n" + "\n".join(["patch:<image>"] * (n - 1)) + "\n" + text.replace("<image>", "")
        else:
            texts = text.split("<image>")
            query = texts[0]
            for i, n in enumerate(patch_counts):
                query += "\n<image>\n" + "\n".join(["patch:<image>"] * (n - 1)) + "\n"
                if i + 1 < len(texts):
                    query += texts[i + 1]

        _, context_tokens = make_context(self.tokenizer, query, None, system_prompt)

        text_inputs = {"input_ids": torch.tensor([context_tokens])}
        image_outputs = {"images": torch.cat(per_image_patches, dim=0)}
        return BatchFeature(data={**text_inputs, **image_outputs})

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's `batch_decode`. Please refer to the docstring
        of that method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's `decode`. Please refer to the docstring of
        that method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
    def model_input_names(self):
        """Names of the tokenizer inputs followed by the image processor inputs, without duplicates."""
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))