chrisc36 commited on
Commit
6c4eb53
·
verified ·
1 Parent(s): d4deff7

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. added_tokens.json +10 -0
  2. config.json +32 -0
  3. config_molmo.py +60 -0
  4. generation_config.json +4 -0
  5. image_preprocessing_molmo.py +569 -0
  6. merges.txt +0 -0
  7. model-00001-of-00083.safetensors +3 -0
  8. model-00002-of-00083.safetensors +3 -0
  9. model-00003-of-00083.safetensors +3 -0
  10. model-00004-of-00083.safetensors +3 -0
  11. model-00005-of-00083.safetensors +3 -0
  12. model-00006-of-00083.safetensors +3 -0
  13. model-00007-of-00083.safetensors +3 -0
  14. model-00008-of-00083.safetensors +3 -0
  15. model-00009-of-00083.safetensors +3 -0
  16. model-00010-of-00083.safetensors +3 -0
  17. model-00011-of-00083.safetensors +3 -0
  18. model-00012-of-00083.safetensors +3 -0
  19. model-00013-of-00083.safetensors +3 -0
  20. model-00014-of-00083.safetensors +3 -0
  21. model-00015-of-00083.safetensors +3 -0
  22. model-00016-of-00083.safetensors +3 -0
  23. model-00017-of-00083.safetensors +3 -0
  24. model-00018-of-00083.safetensors +3 -0
  25. model-00019-of-00083.safetensors +3 -0
  26. model-00020-of-00083.safetensors +3 -0
  27. model-00021-of-00083.safetensors +3 -0
  28. model-00022-of-00083.safetensors +3 -0
  29. model-00023-of-00083.safetensors +3 -0
  30. model-00024-of-00083.safetensors +3 -0
  31. model-00025-of-00083.safetensors +3 -0
  32. model-00026-of-00083.safetensors +3 -0
  33. model-00027-of-00083.safetensors +3 -0
  34. model-00028-of-00083.safetensors +3 -0
  35. model-00029-of-00083.safetensors +3 -0
  36. model-00030-of-00083.safetensors +3 -0
  37. model-00031-of-00083.safetensors +3 -0
  38. model-00032-of-00083.safetensors +3 -0
  39. model-00033-of-00083.safetensors +3 -0
  40. model-00034-of-00083.safetensors +3 -0
  41. model-00035-of-00083.safetensors +3 -0
  42. model-00036-of-00083.safetensors +3 -0
  43. model-00037-of-00083.safetensors +3 -0
  44. model-00038-of-00083.safetensors +3 -0
  45. model-00039-of-00083.safetensors +3 -0
  46. model-00040-of-00083.safetensors +3 -0
  47. model-00041-of-00083.safetensors +3 -0
  48. model-00042-of-00083.safetensors +3 -0
  49. model-00043-of-00083.safetensors +3 -0
  50. model-00044-of-00083.safetensors +3 -0
added_tokens.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<im_col>": 151649,
3
+ "<im_end>": 151647,
4
+ "<im_patch>": 151648,
5
+ "<im_start>": 151646,
6
+ "<|endoftext|>": 151643,
7
+ "<|im_end|>": 151645,
8
+ "<|im_start|>": 151644,
9
+ "<|image|>": 151650
10
+ }
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MolmoForCausalLM"
4
+ ],
5
+ "attention_layer_norm": false,
6
+ "auto_map": {
7
+ "AutoConfig": "config_molmo.MolmoConfig",
8
+ "AutoModelForCausalLM": "modeling_molmo.MolmoForCausalLM"
9
+ },
10
+ "clip_qkv": null,
11
+ "embedding_size": 152064,
12
+ "hidden_size": 8192,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 59136,
15
+ "layer_norm_eps": 1e-05,
16
+ "layer_norm_type": "rms",
17
+ "max_position_embeddings": 4096,
18
+ "model_type": "molmo",
19
+ "norm_after": false,
20
+ "num_attention_heads": 64,
21
+ "num_hidden_layers": 80,
22
+ "num_key_value_heads": 8,
23
+ "qkv_bias": true,
24
+ "rope_theta": 1000000.0,
25
+ "tie_word_embeddings": false,
26
+ "torch_dtype": "float32",
27
+ "transformers_version": "4.43.3",
28
+ "use_cache": true,
29
+ "use_position_ids": true,
30
+ "vocab_size": 152064,
31
+ "weight_tying": false
32
+ }
config_molmo.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
+ from transformers import PretrainedConfig, AutoTokenizer
4
+
5
+
6
+ class MolmoConfig(PretrainedConfig):
7
+ model_type = "molmo"
8
+ keys_to_ignore_at_inference = ["past_key_values"]
9
+
10
+ def __init__(
11
+ self,
12
+ vocab_size=50304,
13
+ embedding_size=50304,
14
+ hidden_size=4096,
15
+ intermediate_size=11008,
16
+ num_hidden_layers=32,
17
+ num_attention_heads=32,
18
+ num_key_value_heads=None,
19
+ max_position_embeddings=2048,
20
+ initializer_range=0.02,
21
+ use_cache=True,
22
+ layer_norm_eps: float = 1e-5,
23
+ rope_theta=10000.0,
24
+ clip_qkv=None,
25
+ qkv_bias: bool = False,
26
+ weight_tying: bool = False,
27
+ use_position_ids: bool=True,
28
+ tie_word_embeddings: bool=True,
29
+ attention_layer_norm: bool=False,
30
+ norm_after: bool = False,
31
+ layer_norm_type: str="rms",
32
+ **kwargs,
33
+ ):
34
+ self.vocab_size = vocab_size
35
+ self.embedding_size = embedding_size
36
+ self.max_position_embeddings = max_position_embeddings
37
+ self.hidden_size = hidden_size
38
+ self.intermediate_size = intermediate_size
39
+ self.num_hidden_layers = num_hidden_layers
40
+ self.num_attention_heads = num_attention_heads
41
+ self.layer_norm_eps = layer_norm_eps
42
+ self.weight_tying = weight_tying
43
+ self.use_position_ids = use_position_ids
44
+ self.attention_layer_norm = attention_layer_norm
45
+ self.num_key_value_heads = num_key_value_heads
46
+ self.initializer_range = initializer_range
47
+ self.use_cache = use_cache
48
+ self.rope_theta = rope_theta
49
+ self.clip_qkv = clip_qkv
50
+ self.qkv_bias = qkv_bias
51
+ self.norm_after = norm_after
52
+ self.tie_word_embeddings = tie_word_embeddings
53
+ self.layer_norm_type = layer_norm_type
54
+
55
+ super().__init__(
56
+ tie_word_embeddings=tie_word_embeddings,
57
+ **kwargs,
58
+ )
59
+
60
+ MolmoConfig.register_for_auto_class()
generation_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "transformers_version": "4.43.3"
4
+ }
image_preprocessing_molmo.py ADDED
@@ -0,0 +1,569 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Image processor class for Molmo"""
2
+ from typing import List, Optional, Union, Mapping
3
+
4
+ import numpy as np
5
+ import einops
6
+ import torch
7
+ import torchvision.transforms
8
+ from torchvision.transforms import InterpolationMode
9
+ from torchvision.transforms.functional import convert_image_dtype
10
+
11
+ from transformers.image_utils import (
12
+ OPENAI_CLIP_MEAN,
13
+ OPENAI_CLIP_STD,
14
+ ImageInput,
15
+ is_valid_image,
16
+ )
17
+ from transformers.processing_utils import ImagesKwargs
18
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
19
+ from transformers.utils import TensorType, is_vision_available, logging
20
+
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+
25
+ def make_batched_images(images) -> List[List[ImageInput]]:
26
+ """
27
+ Accepts images in list or nested list format, and makes a list of images for preprocessing.
28
+
29
+ Args:
30
+ images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
31
+ The input image.
32
+
33
+ Returns:
34
+ list: A list of images.
35
+ """
36
+ if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
37
+ return [img for img_list in images for img in img_list]
38
+
39
+ elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
40
+ return images
41
+
42
+ elif is_valid_image(images):
43
+ return [images]
44
+
45
+ raise ValueError(f"Could not make batched images from {images}")
46
+
47
+
48
+ def pad_to_bounding_box(
49
+ image, offset_height, offset_width, target_height,
50
+ target_width, value=0
51
+ ):
52
+ height, width = image.shape[:2]
53
+ after_padding_width = target_width - offset_width - width
54
+ after_padding_height = target_height - offset_height - height
55
+ return np.pad(image, [
56
+ [offset_height, after_padding_height],
57
+ [offset_width, after_padding_width],
58
+ [0, 0]
59
+ ], constant_values=value)
60
+
61
+
62
+ def normalize_image(image, offset, scale):
63
+ image -= np.array(offset, dtype=np.float32)[None, None, :]
64
+ image /= np.array(scale, dtype=np.float32)[None, None, :]
65
+ return image
66
+
67
+
68
+ def resize_and_pad(
69
+ image,
70
+ desired_output_size,
71
+ resize_method=InterpolationMode.BILINEAR,
72
+ pad_value=0,
73
+ normalize=True,
74
+ image_mean=OPENAI_CLIP_MEAN,
75
+ image_std=OPENAI_CLIP_STD,
76
+ ):
77
+ desired_height, desired_width = desired_output_size
78
+ height, width = image.shape[:2]
79
+
80
+ # Cast into float32 since the training code did this in float32 and it (very rarely) effects
81
+ # the results after rounding.
82
+ image_scale_y = np.array(desired_height, np.float32) / np.array(height, np.float32)
83
+ image_scale_x = np.array(desired_width, np.float32) / np.array(width, np.float32)
84
+ image_scale = min(image_scale_x, image_scale_y)
85
+ scaled_height = int(np.array(height, np.float32) * image_scale)
86
+ scaled_width = int(np.array(width, np.float32) * image_scale)
87
+
88
+ # if resize_method == "tensorflow":
89
+ # FIXME remove
90
+ import tensorflow as tf
91
+ image = tf.image.convert_image_dtype(tf.constant(image), dtype=tf.float32)
92
+ image = tf.image.resize(
93
+ image,
94
+ [scaled_height, scaled_width],
95
+ method=tf.image.ResizeMethod.BILINEAR,
96
+ antialias=True,
97
+ )
98
+ image = tf.clip_by_value(image, 0.0, 1.0)
99
+ image = image.numpy()
100
+ # else:
101
+ # image = torch.permute(torch.from_numpy(image), [2, 0, 1])
102
+ # image = convert_image_dtype(image) # resize in flaot32
103
+ # image = torchvision.transforms.Resize(
104
+ # [scaled_height, scaled_width], InterpolationMode.BILINEAR, antialias=True
105
+ # )(image)
106
+ # image = torch.clip(image, 0.0, 1.0)
107
+ # image = torch.permute(image, [1, 2, 0]).numpy()
108
+
109
+ top_pad = (desired_height - scaled_height) // 2
110
+ left_pad = (desired_width - scaled_width) // 2
111
+ padding = [
112
+ [top_pad, desired_height - scaled_height - top_pad],
113
+ [left_pad, desired_width - scaled_width - left_pad],
114
+ [0, 0]
115
+ ]
116
+ image_mask = np.pad(np.ones_like(image[:, :, 0], dtype=bool), padding[:2])
117
+ image = np.pad(image, padding, constant_values=pad_value)
118
+ if normalize:
119
+ image = normalize_image(image, offset=image_mean, scale=image_std)
120
+ return image, image_mask
121
+
122
+
123
+ def select_tiling(h, w, patch_size, max_num_patches):
124
+ """Decide how best to divide in image of size [w, h] in up to max_num_patches of size patch_size"""
125
+ original_size = np.stack([h, w]) # [1, 2]
126
+ original_res = h * w
127
+ tilings = []
128
+ for i in range(1, max_num_patches+1):
129
+ for j in range(1, max_num_patches+1):
130
+ if i*j <= max_num_patches:
131
+ tilings.append((i, j))
132
+ # sort so argmin and argmax favour smaller tilings in the event of a tie
133
+ tilings.sort(key=lambda x: (x[0]*x[1], x[0]))
134
+ candidate_tilings = np.array(tilings, dtype=np.int32) # [n_resolutions, 2]
135
+ candidate_resolutions = candidate_tilings * patch_size # [n_resolutions, 2]
136
+
137
+ # How much we would need to scale the image to fit exactly in each tiling
138
+ original_size = np.stack([h, w], dtype=np.float32) # [1, 2]
139
+ required_scale_d = candidate_resolutions.astype(np.float32) / original_size
140
+ required_scale = np.min(required_scale_d, axis=-1, keepdims=True) # [n_resolutions, 1]
141
+ if np.all(required_scale < 1):
142
+ # We are forced to downscale, so try to minimize the amount of downscaling
143
+ ix = np.argmax(required_scale)
144
+ else:
145
+ # Pick the resolution that required the least upscaling so that it most closely fits the image
146
+ required_scale = np.where(required_scale < 1.0, 10e9, required_scale)
147
+ ix = np.argmin(required_scale)
148
+ return candidate_tilings[ix]
149
+
150
+
151
+ class MolmoImagesKwargs(ImagesKwargs, total=False):
152
+ max_crops: Optional[int]
153
+ overlap_margins: Optional[List[int]]
154
+ base_image_input_size: Optional[List[int]]
155
+ image_token_length_w: Optional[int]
156
+ image_token_length_h: Optional[int]
157
+ image_patch_size: Optional[int]
158
+ image_padding_mask: Optional[bool]
159
+
160
+
161
+ class MolmoImageProcessor(BaseImageProcessor):
162
+ """Preprocess images and multi-model inputs"""
163
+
164
+ def __init__(
165
+ self,
166
+ max_crops: int = 12,
167
+ overlap_margins: List[int] = (4, 4),
168
+ base_image_input_size: List[int] = (336, 336),
169
+ image_token_length_w: int = 12,
170
+ image_token_length_h: int = 12,
171
+ image_patch_size: int = 14,
172
+ image_padding_mask: bool = True,
173
+ do_normalize: bool = True,
174
+ image_mean: Optional[Union[float, List[float]]] = None,
175
+ image_std: Optional[Union[float, List[float]]] = None,
176
+ **kwargs,
177
+ ):
178
+ super().__init__(**kwargs)
179
+ self.max_crops = max_crops
180
+ self.overlap_margins = overlap_margins
181
+ self.base_image_input_size = base_image_input_size
182
+ self.image_token_length_w = image_token_length_w
183
+ self.image_token_length_h = image_token_length_h
184
+ self.image_patch_size = image_patch_size
185
+ self.image_padding_mask = image_padding_mask
186
+ self.do_normalize = do_normalize
187
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
188
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
189
+
190
+ def image_to_patches_and_tokens(
191
+ self,
192
+ image: ImageInput,
193
+ image_patch_token_id: int,
194
+ image_col_token_id: int,
195
+ image_start_token_id: int,
196
+ image_end_token_id: int,
197
+ max_crops: Optional[int] = None,
198
+ overlap_margins: Optional[List[int]] = None,
199
+ base_image_input_size: Optional[Union[int, List[int]]] = None,
200
+ image_token_length_w: Optional[int] = None,
201
+ image_token_length_h: Optional[int] = None,
202
+ image_patch_size: Optional[int] = None,
203
+ ):
204
+ """Preprocesses an image
205
+
206
+ Returns:
207
+ crops: (n_crops, n_patches, patch_dim) individual crops, `n_crops` might
208
+ change between images but the other dimension are fixed
209
+ tokens: (n_tokens,) int32 tokens, pad tokens indicating where to insert the
210
+ patch features, might include other special tokens as well
211
+ patch_ordering: (n_crops, n_tokens_per_crop) order image features should be inserted
212
+ into the `tokens`, negative values indicates patches features to exclude
213
+ padding_mask: (n_crops, n_patches) what percent of each crop is padding, be None
214
+ if the image mask is not being used.
215
+ """
216
+ if isinstance(base_image_input_size, int):
217
+ base_image_input_size = (base_image_input_size, base_image_input_size)
218
+
219
+ base_image_input_d = image_patch_size
220
+ tokens_per_image = image_token_length_w * image_token_length_h
221
+ image_base_patch_w = base_image_input_size[1] // base_image_input_d
222
+ image_base_patch_h = base_image_input_size[0] // base_image_input_d
223
+
224
+ original_image_h, original_image_w = image.shape[:2]
225
+ crop_size = base_image_input_size[0]
226
+
227
+ # Discard this many patches from the (left/top, right/bottom) of crops
228
+ left_margin, right_margin = overlap_margins
229
+ # left_margin, right_margin = 2, 2
230
+ assert left_margin % 2 == 0 # Required for compatibility with 2x2 pooling
231
+ total_margin_pixels = base_image_input_d*(right_margin + left_margin) # pixels removed per dim
232
+ crop_patches = base_image_input_size[0] // base_image_input_d # patches per crop dim
233
+ crop_window_patches = crop_patches - (right_margin + left_margin) # usable patches
234
+ crop_window_size = crop_window_patches * base_image_input_d
235
+ tiling = select_tiling(
236
+ original_image_h - total_margin_pixels,
237
+ original_image_w - total_margin_pixels,
238
+ crop_window_size,
239
+ max_crops
240
+ )
241
+ src, img_mask = resize_and_pad(
242
+ image,
243
+ [tiling[0]*crop_window_size+total_margin_pixels, tiling[1]*crop_window_size+total_margin_pixels]
244
+ )
245
+
246
+ # Now we have to split the image into crops, while keeping track of how each patch in the
247
+ # each crop should be ordered in the global image, this require a lot of tricky booking
248
+ n_crops = tiling[0] * tiling[1]
249
+ patches_arr = []
250
+ mask_arr = []
251
+ patch_ordering_arr = []
252
+
253
+ # We assume 2x2 pooling, but can allow padding the right/bottom with extra
254
+ # patches if the number of patches per side is not even
255
+ assert (crop_patches+1)//2 == image_token_length_h
256
+ assert (crop_patches+1)//2 == image_token_length_w
257
+ on = 0
258
+ on_patch = 0
259
+ for i in range(tiling[0]):
260
+ y0 = i*crop_window_size
261
+ if i == 0:
262
+ crop_y0 = 0
263
+ else:
264
+ crop_y0 = left_margin // 2
265
+
266
+ crop_h = image_base_patch_h - (right_margin + left_margin)
267
+ if i == 0:
268
+ crop_h += left_margin
269
+ if i == (tiling[0]-1):
270
+ crop_h += right_margin
271
+ for j in range(tiling[1]):
272
+ x0 = j*crop_window_size
273
+ if j == 0:
274
+ crop_x0 = 0
275
+ else:
276
+ crop_x0 = left_margin // 2
277
+
278
+ crop_w = image_base_patch_w - (right_margin + left_margin)
279
+ if j == 0:
280
+ crop_w += left_margin
281
+ if j == (tiling[1]-1):
282
+ crop_w += right_margin
283
+
284
+ pooled_w = (crop_w + 1) // 2
285
+ pooled_h = (crop_h + 1) // 2
286
+ patch_ordering_arr.append(
287
+ pad_to_bounding_box(
288
+ np.reshape(np.arange(on, on+pooled_h*pooled_w, dtype=np.int32), (pooled_h, pooled_w, 1)),
289
+ crop_y0, crop_x0, image_token_length_h, image_token_length_w, value=-1
290
+ )[:, :, 0]
291
+ )
292
+ patches_arr.append(src[y0:y0+crop_size, x0:x0+crop_size])
293
+ mask_arr.append(img_mask[y0:y0+crop_size, x0:x0+crop_size])
294
+
295
+ on += pooled_h*pooled_w
296
+ on_patch += 1
297
+ patches = np.stack(patches_arr)
298
+ patch_ordering = np.stack(patch_ordering_arr)
299
+ img_mask = np.stack(mask_arr)
300
+
301
+ # Switch to [n_crops, n_patches, pixels_per_patch] format
302
+ image_layout_impatch_w, image_layout_impatch_h = tiling[0], tiling[1]
303
+ patches = einops.rearrange(
304
+ patches, 'p (h dh) (w dw) c -> p (h w) (dh dw c)',
305
+ dh=base_image_input_d,
306
+ dw=base_image_input_d,
307
+ h=image_base_patch_h,
308
+ w=image_base_patch_w
309
+ )
310
+ img_mask = einops.rearrange(
311
+ img_mask, 'p (h dh) (w dw) -> p (h w) (dh dw)',
312
+ dh=base_image_input_d,
313
+ dw=base_image_input_d,
314
+ h=image_base_patch_h,
315
+ w=image_base_patch_w
316
+ )
317
+
318
+ img_mask = img_mask.astype(np.float32).mean(axis=-1)
319
+ patch_ordering = np.reshape(patch_ordering, [-1])
320
+ valid = patch_ordering >= 0
321
+
322
+ # Transpose order, to get left-to-right order instead of crop-by-crop order
323
+ patch_ordering_rh = np.reshape(
324
+ patch_ordering,
325
+ [tiling[0], tiling[1], image_token_length_h, image_token_length_w]
326
+ )
327
+ patch_ordering_rh = np.transpose(patch_ordering_rh, [0, 2, 1, 3])
328
+ patch_ordering_rh = np.reshape(patch_ordering_rh, [-1])
329
+
330
+ # The transpose will screw up which patches are masked, project the
331
+ # new order into sparse structure of `patch_ordering` to fix this
332
+ patch_ordering[valid] = patch_ordering_rh[patch_ordering_rh >= 0]
333
+
334
+ # Now build the output tokens
335
+ h = tiling[0] * crop_window_patches + (right_margin+left_margin)
336
+ w = tiling[1] * crop_window_patches + (right_margin+left_margin)
337
+ per_row = np.full(
338
+ ((w+1)//2,),
339
+ image_patch_token_id,
340
+ )
341
+ per_row = np.concatenate([per_row, [image_col_token_id]], 0)
342
+
343
+ joint = np.tile(per_row, [(h+1)//2])
344
+ joint = [
345
+ [image_start_token_id],
346
+ joint,
347
+ [image_end_token_id]
348
+ ]
349
+
350
+ # Finally do the same for the global image
351
+ resized, _ = resize_and_pad(image, base_image_input_size)
352
+ resized = einops.rearrange(
353
+ resized, '(h dh) (w dw) c -> (h w) (dh dw c)',
354
+ dh=base_image_input_d,
355
+ dw=base_image_input_d,
356
+ h=image_base_patch_h,
357
+ w=image_base_patch_w
358
+ )
359
+ patches = np.concatenate([np.expand_dims(resized, 0), patches], 0)
360
+
361
+ # Global image goes first, so the order of patches in previous crops gets increased
362
+ patch_ordering = np.where(
363
+ patch_ordering >= 0,
364
+ patch_ordering + tokens_per_image,
365
+ -1
366
+ )
367
+ patch_ordering = np.concatenate([np.arange(0, tokens_per_image), patch_ordering], 0)
368
+ per_row = np.full(
369
+ (image_token_length_w,),
370
+ image_patch_token_id,
371
+ )
372
+ per_row = np.concatenate([per_row, [image_col_token_id]], 0)
373
+ extra_tokens = np.tile(per_row, [image_token_length_h])
374
+ joint = [
375
+ [image_start_token_id],
376
+ extra_tokens,
377
+ [image_end_token_id],
378
+ ] + joint
379
+
380
+ joint = np.concatenate(joint, 0)
381
+ img_mask = np.pad(img_mask, [[0, 1], [0, 0]], constant_values=-1)
382
+ return patches, joint, patch_ordering, img_mask
383
+
384
+ def build_image_input_idx(
385
+ self,
386
+ image_tokens: np.ndarray,
387
+ patch_order: np.ndarray,
388
+ image_patch_token_id: int,
389
+ no_image: Optional[bool] = None,
390
+ image_token_length_w: Optional[int] = None,
391
+ image_token_length_h: Optional[int] = None,
392
+ ):
393
+ """Converts `patch_order` into a mapping of token_id -> patch_id"""
394
+
395
+ tokens_per_image = image_token_length_w * image_token_length_h
396
+ if no_image is not None and no_image:
397
+ return np.zeros((0, tokens_per_image), np.int32)
398
+
399
+ # Indices to insert the patches
400
+ image_input_idx = image_tokens == image_patch_token_id
401
+ image_input_idx = np.nonzero(image_input_idx)[0].astype(np.int32)
402
+
403
+ if patch_order is not None:
404
+ n_tokens = image_input_idx.shape[0]
405
+ patch_order = np.reshape(patch_order, [-1])
406
+ n_patches = patch_order.shape[0]
407
+
408
+ valid = patch_order >= 0
409
+ n_valid_patches = valid.sum()
410
+ assert len(image_input_idx) == n_valid_patches
411
+
412
+ sorted_patch_ixs = np.zeros([n_tokens], np.int32)
413
+ sorted_patch_ixs[patch_order[valid]] = np.arange(n_valid_patches, dtype=np.int32)
414
+
415
+ # Project the inverted mapping into same sparse structure
416
+ sorted_patch_ixs_ex = np.full(np.shape(patch_order), -1)
417
+ sorted_patch_ixs_ex[valid] = sorted_patch_ixs
418
+
419
+ # Do the gather and then re-masked outputs that were masked in `sorted_patch_ixs`
420
+ valid = (sorted_patch_ixs_ex >= 0).astype(np.int32)
421
+ image_input_idx = image_input_idx[sorted_patch_ixs_ex*valid]
422
+ image_input_idx = image_input_idx*valid - 100*(1 - valid)
423
+ image_input_idx = np.reshape(image_input_idx, [-1, tokens_per_image])
424
+ return image_input_idx
425
+
426
+ def preprocess(
427
+ self,
428
+ image: np.ndarray,
429
+ image_patch_token_id: int,
430
+ image_col_token_id: int,
431
+ image_start_token_id: int,
432
+ image_end_token_id: int,
433
+ max_crops: Optional[int] = None,
434
+ overlap_margins: Optional[List[int]] = None,
435
+ base_image_input_size: Optional[Union[int, List[int]]] = None,
436
+ image_token_length_w: Optional[int] = None,
437
+ image_token_length_h: Optional[int] = None,
438
+ image_patch_size: Optional[int] = None,
439
+ **kwargs,
440
+ ):
441
+ """Preprocesses a single image"""
442
+
443
+ max_crops = max_crops or self.max_crops
444
+ overlap_margins = overlap_margins or self.overlap_margins
445
+ base_image_input_size = base_image_input_size or self.base_image_input_size
446
+ image_token_length_w = image_token_length_w or self.image_token_length_w
447
+ image_token_length_h = image_token_length_h or self.image_token_length_h
448
+ image_patch_size = image_patch_size or self.image_patch_size
449
+
450
+ crops, image_tokens, patch_ordering, img_mask = self.image_to_patches_and_tokens(
451
+ image,
452
+ image_patch_token_id,
453
+ image_col_token_id,
454
+ image_start_token_id,
455
+ image_end_token_id,
456
+ max_crops,
457
+ overlap_margins,
458
+ base_image_input_size,
459
+ image_token_length_w,
460
+ image_token_length_h,
461
+ image_patch_size,
462
+ )
463
+ patch_idx = self.build_image_input_idx(
464
+ image_tokens,
465
+ patch_ordering,
466
+ image_patch_token_id,
467
+ image_token_length_w=image_token_length_w,
468
+ image_token_length_h=image_token_length_h,
469
+ )
470
+ return crops, image_tokens, patch_idx, img_mask
471
+
472
+ def multimodal_preprocess(
473
+ self,
474
+ images: np.ndarray,
475
+ tokens: List[int],
476
+ image_idx: np.ndarray,
477
+ sequence_length: int,
478
+ image_patch_token_id: int,
479
+ image_col_token_id: int,
480
+ image_start_token_id: int,
481
+ image_end_token_id: int,
482
+ **kwargs,
483
+ ):
484
+ """Merge images and text tokens into multi-modal features for the model
485
+
486
+ :param images: images to use as input
487
+ :param tokens: input text tokens
488
+ :param image_idx: where to insert the images into `tokens`
489
+ :params image_patch_token_id: id to use of tokens that will contain image features
490
+ :params image_col_token_id: token id for image column special tokens
491
+ :params image_start_token_id: token id for image start special tokens
492
+ :params image_end_token_id: token id for image end special tokens
493
+ :params kwargs: override preprocessor default args
494
+ """
495
+ max_total_crops = kwargs.get("max_crops") or self.max_crops
496
+ image_token_length_w = kwargs.get("image_token_length_w") or self.image_token_length_w
497
+ image_token_length_h = kwargs.get("image_token_length_h") or self.image_token_length_h
498
+ image_patch_size = kwargs.get("image_patch_size") or self.image_patch_size
499
+ base_image_input_size = kwargs.get("base_image_input_size") or self.base_image_input_size
500
+ image_num_patch = (
501
+ base_image_input_size[0] // image_patch_size,
502
+ base_image_input_size[1] // image_patch_size,
503
+ )
504
+ image_padding_mask = kwargs.get("image_padding_mask") or self.image_padding_mask
505
+
506
+ tokens_per_image = image_token_length_w * image_token_length_h
507
+ n_pixels = image_patch_size * image_patch_size * 3
508
+ n_patches = image_num_patch[0] * image_num_patch[1]
509
+
510
+ if images is None:
511
+ return {
512
+ "input_ids": tokens,
513
+ "images": None,
514
+ "image_input_idx": None
515
+ }
516
+ else:
517
+ n = len(images)
518
+ all_crops = []
519
+ all_image_idx = []
520
+ out_tokens = []
521
+ all_crop_masks = []
522
+
523
+ for ix in range(n):
524
+ token_ix = image_idx[ix]
525
+ crops, image_tokens, patch_idx, img_mask = self.preprocess(
526
+ images[ix],
527
+ image_patch_token_id,
528
+ image_col_token_id,
529
+ image_start_token_id,
530
+ image_end_token_id,
531
+ **kwargs,
532
+ )
533
+
534
+ if token_ix == -1: # -1 is an image inserted at the very start
535
+ start = 0
536
+ token_ix = 0
537
+ end = 0
538
+ else:
539
+ start = 0 if ix == 0 else image_idx[ix-1] + 1
540
+ end = token_ix + 1
541
+
542
+ all_image_idx.append(patch_idx + token_ix)
543
+ all_crops.append(crops)
544
+ out_tokens.append(tokens[start:token_ix])
545
+ out_tokens.append(image_tokens)
546
+ if ix == (n - 1):
547
+ out_tokens.append(tokens[end:])
548
+ if image_padding_mask:
549
+ all_crop_masks.append(img_mask)
550
+
551
+ input_ids = np.concatenate(out_tokens, 0)
552
+ images = np.concatenate(all_crops, 0)
553
+ image_input_idx = np.concatenate(all_image_idx, 0)
554
+ if image_padding_mask:
555
+ image_masks = np.concatenate(all_crop_masks, 0)
556
+ else:
557
+ image_masks = None
558
+
559
+ out = {
560
+ "input_ids": input_ids,
561
+ "images": images,
562
+ "image_input_idx": image_input_idx
563
+ }
564
+ if image_masks is not None:
565
+ out["image_masks"] = image_masks
566
+ return out
567
+
568
+
569
+ MolmoImageProcessor.register_for_auto_class()
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5027bfc3649b4bb4fb35e5be030dcfeb84f0d38880e56454d818092f4989e339
3
+ size 4987060576
model-00002-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1cc3aa5377ed4604299078f9106c228d7e08da1cb173b680274eee7cd440dae
3
+ size 4748125472
model-00003-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2e5c5b0cf708950900cddde59fa3b56b0f5f89cd307f96c7634a3d810652f5a
3
+ size 3846325304
model-00004-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:439ac00ade686bf9af4aac0c03d41da02e4c93a56769fe6ffa48472bf6c94368
3
+ size 3510739800
model-00005-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89861d9233c358a6db60efab583c77784ebdee4d4e30e860f3f55bcae4c9cdc2
3
+ size 3510739800
model-00006-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:494de8dd6ff50adeac7e92a53ab057e04c19748beee53601345731814b830240
3
+ size 3510739800
model-00007-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e196c810473673bc7ccc015970ae72a45a2ed43e364e691d40105bd088b1c240
3
+ size 3510739800
model-00008-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ef4081b8fbf67b7a5ec34ed9167e4cd8aa0950cb039300a1c9caf5adc56d2aa
3
+ size 3510739800
model-00009-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cbda5d6ff2afd461826b7feab07dbfea21cf8f8a9ce122c61ae73c5f51edaac
3
+ size 3510739800
model-00010-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0469e47cb78e5685fcd90d7ab8871311ea7c8eaadd3564fb7d0498db0eeb5090
3
+ size 3510739800
model-00011-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc0aea52fdc716d72e6ae9457a21788324eeb9581984409d4531ad4b26ea55e
3
+ size 3510739792
model-00012-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bda2bf30ef4903b8ca8cc03ed917a93ac25bc981496a583578328fd7f899e736
3
+ size 3510739808
model-00013-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bb1a0125fcc9ed788d9251a02867a9900d411643839fdff4a1cbdc5291545e1
3
+ size 3510739808
model-00014-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7dfd129c7c4310b2dfa2586552b927b95eb28c5b8de84775a744a8be870f19dd
3
+ size 3510739808
model-00015-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0046b0763b8cf9cd100ee51fb8440a8b90174764e1e620e6dc44f8f1bb60956e
3
+ size 3510739808
model-00016-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca6205a2ba7dd0a0f4648f17ba83345a747ac00087308ff8994b5c408b8d42b9
3
+ size 3510739808
model-00017-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:476c85010940c0dcaf22b91b9e354deb44aea785e1c24d647fe8683682070762
3
+ size 3510739808
model-00018-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c50af8499746df17e7f67ceec754313fad39389de2fc0c2e7165f2f288a9c1e3
3
+ size 3510739808
model-00019-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:193de920a1d37b44c478bdef9b10bd2da721a1efa0550d530de05abe6e05ccbf
3
+ size 3510739808
model-00020-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a2448617f3927b8a4e75f9953c1dbd398e68282e662b4fe03cc3430dfcb14df
3
+ size 3510739808
model-00021-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81736cc90cfc2d5f96786eb864ba5c9499a54256209a2dbaab4bc53bba2ca8f7
3
+ size 3510739808
model-00022-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:121d0a02ee908944e8477f89b61972387224c6944e5ff8d0d89dca00fb15e4df
3
+ size 3510739808
model-00023-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba960e96b20901a97d8ab4dd63577c57b4115836f4faf46cfd9e9ef01186f7dd
3
+ size 3510739808
model-00024-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba41eb0b90b1f5ae7f72e88e32429bd2a3f8ff8073d99efbf2c048cf66b9ad78
3
+ size 3510739808
model-00025-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad847636f627f05d37ab7c96bf8b39b37da677d290ec758425e9f84aab2420ee
3
+ size 3510739808
model-00026-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:239e7422ffeea9561dabd94be05cbc99489219f556ccce468caa9fc601682fe9
3
+ size 3510739808
model-00027-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e85365a207fe405f7062b12389c8f26a0e9a9cb0be3bbb039b2ecdadb18cb00c
3
+ size 3510739808
model-00028-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ea35ccd0a85cf2e3ef4f28f355e8fabbf1a7ff20a5dd8437549803841758961
3
+ size 3510739808
model-00029-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e44a12e4589f4c20a4065fabad5c8fe3ddc0e373b640b4a869d17f595fbf6aee
3
+ size 3510739808
model-00030-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ac210af50eddf60604c7a8c793bcfcb3b26144c42231fd8485ccb1a3a6bf444
3
+ size 3510739808
model-00031-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8b61c5215ac70a00a50c0e3aa3a2efe635683a10cee509983b194cdd5434bb2
3
+ size 3510739808
model-00032-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64a3ff0e6a48b91a6c36de5568cec3975530377df14b275f463cd33e1f5f600e
3
+ size 3510739808
model-00033-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e0950d7c66ac9fe3fdbe5a6fbb8af46ae30d5da0ab75fb37cce25ed18849a77
3
+ size 3510739808
model-00034-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b35d01cd2551ea952a66f0dc99994da13b0585991914def92744f49d079471b0
3
+ size 3510739808
model-00035-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbc5e17860c5cb786b97e843e61489c806ba4b3f81805efd517272c4cdef4912
3
+ size 3510739808
model-00036-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04e473ed195a967e28ce494561ae17bc1b0b1b8ba9bce8e916a000477f294d1f
3
+ size 3510739808
model-00037-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9cb9dbeb0d9b9fb1414421c40fb77887ae2c5a72ae2114538322b5468340d44
3
+ size 3510739808
model-00038-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f04c5bcdce77116fa013fe8bec74149986c7e181b831d5cbf2336746ed5f789
3
+ size 3510739808
model-00039-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d494a0cdb3d81d12b57452317fcb6c22e2f27d4d1e5d1d57d14e212461ec6d85
3
+ size 3510739808
model-00040-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b93a4a502934019d5c9854f89ed555067975262d22fce5622ca320f1581ee498
3
+ size 3510739808
model-00041-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d01aab48a7a5fcf935597a460bc6683f1865c56495feded7ee2d8d005a39af80
3
+ size 3510739808
model-00042-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a252102c2608a674a04ec7592777256506508a31a03024864177437c4e59963
3
+ size 3510739808
model-00043-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:011433574b832a224274bffbe56a2fdb62f3e47f81aff71b8fb14b35a7a15d1e
3
+ size 3510739808
model-00044-of-00083.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f3a810ade5fc94a14c7c7e21a85722a5a2da4c1fe1bb36d7b7a79a73a30e98d
3
+ size 3510739808