VictorSanh committed on
Commit 01aa456 · 1 Parent(s): b1b2476

Delete unnecessary files

image_processing_idefics.py DELETED
@@ -1,168 +0,0 @@
- # coding=utf-8
- # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """Image processor class for Idefics."""
-
- from typing import Callable, Dict, List, Optional, Union
-
- from PIL import Image
-
- from ...image_processing_utils import BaseImageProcessor, BatchFeature
- from ...image_transforms import resize, to_channel_dimension_format
- from ...image_utils import (
-     ChannelDimension,
-     ImageInput,
-     PILImageResampling,
-     make_list_of_images,
-     to_numpy_array,
-     valid_images,
- )
- from ...utils import TensorType, is_torch_available
-
-
- IDEFICS_STANDARD_MEAN = [0.48145466, 0.4578275, 0.40821073]
- IDEFICS_STANDARD_STD = [0.26862954, 0.26130258, 0.27577711]
-
-
- def convert_to_rgb(image):
-     # `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background
-     # for transparent images. The call to `alpha_composite` handles this case
-     if image.mode == "RGB":
-         return image
-
-     image_rgba = image.convert("RGBA")
-     background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
-     alpha_composite = Image.alpha_composite(background, image_rgba)
-     alpha_composite = alpha_composite.convert("RGB")
-     return alpha_composite
-
-
- class IdeficsImageProcessor(BaseImageProcessor):
-     r"""
-     Constructs a Idefics image processor.
-
-     Args:
-         image_size (`int`, *optional*, defaults to 224):
-             Resize to image size
-         image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`):
-             Mean to use if normalizing the image. This is a float or list of floats the length of the number of
-             channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
-             overridden by the `image_mean` parameter in the `preprocess` method.
-         image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`):
-             Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
-             number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
-             Can be overridden by the `image_std` parameter in the `preprocess` method.
-         image_num_channels (`int`, *optional*, defaults to 3):
-             Number of image channels.
-     """
-
-     model_input_names = ["pixel_values"]
-
-     def __init__(
-         self,
-         image_size: int = 224,
-         image_mean: Optional[Union[float, List[float]]] = None,
-         image_std: Optional[Union[float, List[float]]] = None,
-         image_num_channels: Optional[int] = 3,
-         **kwargs,
-     ) -> None:
-         super().__init__(**kwargs)
-
-         self.image_size = image_size
-         self.image_num_channels = image_num_channels
-         self.image_mean = image_mean
-         self.image_std = image_std
-
-     def preprocess(
-         self,
-         images: ImageInput,
-         image_num_channels: Optional[int] = 3,
-         image_size: Optional[Dict[str, int]] = None,
-         image_mean: Optional[Union[float, List[float]]] = None,
-         image_std: Optional[Union[float, List[float]]] = None,
-         transform: Callable = None,
-         **kwargs,
-     ) -> TensorType.PYTORCH:
-         """
-         Preprocess a batch of images.
-
-         Args:
-             images (`ImageInput`):
-                 A list of images to preprocess.
-             image_size (`int`, *optional*, defaults to `self.image_size`):
-                 Resize to image size
-             image_num_channels (`int`, *optional*, defaults to `self.image_num_channels`):
-                 Number of image channels.
-             image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`):
-                 Mean to use if normalizing the image. This is a float or list of floats the length of the number of
-                 channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can
-                 be overridden by the `image_mean` parameter in the `preprocess` method.
-             image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`):
-                 Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
-                 number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess`
-                 method. Can be overridden by the `image_std` parameter in the `preprocess` method.
-             transform (`Callable`, *optional*, defaults to `None`):
-                 A custom transform function that accepts a single image can be passed for training. For example,
-                 `torchvision.Compose` can be used to compose multiple transforms. If `None` - an inference mode is
-                 assumed - and then a preset of inference-specific transforms will be applied to the images
-
-         Returns:
-             a PyTorch tensor of the processed images
-
-         """
-         image_size = image_size if image_size is not None else self.image_size
-         image_num_channels = image_num_channels if image_num_channels is not None else self.image_num_channels
-         image_mean = image_mean if image_mean is not None else self.image_mean
-         image_std = image_std if image_std is not None else self.image_std
-         size = (image_size, image_size)
-
-         if isinstance(images, list) and len(images) == 0:
-             return []
-
-         images = make_list_of_images(images)
-
-         if not valid_images(images):
-             raise ValueError(
-                 "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
-                 "torch.Tensor, tf.Tensor or jax.ndarray."
-             )
-
-         # For training a user needs to pass their own set of transforms as a Callable.
-         # For reference this is what was used in the original IDEFICS training:
-         # transform = transforms.Compose([
-         #     convert_to_rgb,
-         #     transforms.RandomResizedCrop((size, size), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC),
-         #     transforms.ToTensor(),
-         #     transforms.Normalize(mean=image_mean, std=image_std),
-         # ])
-         if transform is not None:
-             if not is_torch_available():
-                 raise ImportError("To pass in `transform` torch must be installed")
-             import torch
-
-             images = [transform(x) for x in images]
-             return torch.stack(images)
-
-         # for inference we do the exact transforms that were used to train IDEFICS
-         images = [convert_to_rgb(x) for x in images]
-         # further transforms expect numpy arrays
-         images = [to_numpy_array(x) for x in images]
-         images = [resize(x, size, resample=PILImageResampling.BICUBIC) for x in images]
-         images = [self.rescale(image=image, scale=1 / 255) for image in images]
-         images = [self.normalize(x, mean=image_mean, std=image_std) for x in images]
-         images = [to_channel_dimension_format(x, ChannelDimension.FIRST) for x in images]
-         # TODO: this converts to torch tensors - switch to convert_to_tensors once it becomes available
-         images = BatchFeature(data={"pixel_values": images}, tensor_type=TensorType.PYTORCH)["pixel_values"]
-
-         return images
image_processing_img2html.py DELETED
@@ -1,168 +0,0 @@
- # coding=utf-8
- # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """Image processor class for Img2HTML."""
-
- from typing import Callable, Dict, List, Optional, Union
-
- from PIL import Image
-
- from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
- from transformers.image_transforms import resize, to_channel_dimension_format
- from transformers.image_utils import (
-     ChannelDimension,
-     ImageInput,
-     PILImageResampling,
-     make_list_of_images,
-     to_numpy_array,
-     valid_images,
- )
- from transformers.utils import TensorType, is_torch_available
-
-
- IMG2HTML_STANDARD_MEAN = [0.5, 0.5, 0.5]
- IMG2HTML_STANDARD_STD = [0.5, 0.5, 0.5]
-
-
- def convert_to_rgb(image):
-     # `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background
-     # for transparent images. The call to `alpha_composite` handles this case
-     if image.mode == "RGB":
-         return image
-
-     image_rgba = image.convert("RGBA")
-     background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
-     alpha_composite = Image.alpha_composite(background, image_rgba)
-     alpha_composite = alpha_composite.convert("RGB")
-     return alpha_composite
-
-
- class Img2HTMLImageProcessor(BaseImageProcessor):
-     r"""
-     Constructs a Img2HTML image processor.
-
-     Args:
-         image_size (`int`, *optional*, defaults to 224):
-             Resize to image size
-         image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`):
-             Mean to use if normalizing the image. This is a float or list of floats the length of the number of
-             channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
-             overridden by the `image_mean` parameter in the `preprocess` method.
-         image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`):
-             Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
-             number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
-             Can be overridden by the `image_std` parameter in the `preprocess` method.
-         image_num_channels (`int`, *optional*, defaults to 3):
-             Number of image channels.
-     """
-
-     model_input_names = ["pixel_values"]
-
-     def __init__(
-         self,
-         image_size: int = 224,
-         image_mean: Optional[Union[float, List[float]]] = None,
-         image_std: Optional[Union[float, List[float]]] = None,
-         image_num_channels: Optional[int] = 3,
-         **kwargs,
-     ) -> None:
-         super().__init__(**kwargs)
-
-         self.image_size = image_size
-         self.image_num_channels = image_num_channels
-         self.image_mean = image_mean
-         self.image_std = image_std
-
-     def preprocess(
-         self,
-         images: ImageInput,
-         image_num_channels: Optional[int] = 3,
-         image_size: Optional[Dict[str, int]] = None,
-         image_mean: Optional[Union[float, List[float]]] = None,
-         image_std: Optional[Union[float, List[float]]] = None,
-         transform: Callable = None,
-         **kwargs,
-     ) -> TensorType.PYTORCH:
-         """
-         Preprocess a batch of images.
-
-         Args:
-             images (`ImageInput`):
-                 A list of images to preprocess.
-             image_size (`int`, *optional*, defaults to `self.image_size`):
-                 Resize to image size
-             image_num_channels (`int`, *optional*, defaults to `self.image_num_channels`):
-                 Number of image channels.
-             image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`):
-                 Mean to use if normalizing the image. This is a float or list of floats the length of the number of
-                 channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can
-                 be overridden by the `image_mean` parameter in the `preprocess` method.
-             image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`):
-                 Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
-                 number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess`
-                 method. Can be overridden by the `image_std` parameter in the `preprocess` method.
-             transform (`Callable`, *optional*, defaults to `None`):
-                 A custom transform function that accepts a single image can be passed for training. For example,
-                 `torchvision.Compose` can be used to compose multiple transforms. If `None` - an inference mode is
-                 assumed - and then a preset of inference-specific transforms will be applied to the images
-
-         Returns:
-             a PyTorch tensor of the processed images
-
-         """
-         image_size = image_size if image_size is not None else self.image_size
-         image_num_channels = image_num_channels if image_num_channels is not None else self.image_num_channels
-         image_mean = image_mean if image_mean is not None else self.image_mean
-         image_std = image_std if image_std is not None else self.image_std
-         size = (image_size, image_size)
-
-         if isinstance(images, list) and len(images) == 0:
-             return []
-
-         images = make_list_of_images(images)
-
-         if not valid_images(images):
-             raise ValueError(
-                 "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
-                 "torch.Tensor, tf.Tensor or jax.ndarray."
-             )
-
-         # For training a user needs to pass their own set of transforms as a Callable.
-         # For reference this is what was used in the original IDEFICS training:
-         # transform = transforms.Compose([
-         #     convert_to_rgb,
-         #     transforms.RandomResizedCrop((size, size), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BILINEAR),
-         #     transforms.ToTensor(),
-         #     transforms.Normalize(mean=image_mean, std=image_std),
-         # ])
-         if transform is not None:
-             if not is_torch_available():
-                 raise ImportError("To pass in `transform` torch must be installed")
-             import torch
-
-             images = [transform(x) for x in images]
-             return torch.stack(images)
-
-         # for inference we do the exact transforms that were used to train IDEFICS
-         images = [convert_to_rgb(x) for x in images]
-         # further transforms expect numpy arrays
-         images = [to_numpy_array(x) for x in images]
-         images = [resize(x, size, resample=PILImageResampling.BILINEAR) for x in images]
-         images = [self.rescale(image=image, scale=1 / 255) for image in images]
-         images = [self.normalize(x, mean=image_mean, std=image_std) for x in images]
-         images = [to_channel_dimension_format(x, ChannelDimension.FIRST) for x in images]
-         # TODO: this converts to torch tensors - switch to convert_to_tensors once it becomes available
-         images = BatchFeature(data={"pixel_values": images}, tensor_type=TensorType.PYTORCH)["pixel_values"]
-
-         return images
processing_idefics.py DELETED
@@ -1,414 +0,0 @@
- # coding=utf-8
- # Copyright 2022 The HuggingFace Inc. team.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """
- Processor class for IDEFICS.
- """
-
- from typing import Callable, List, Optional, Union
- from urllib.parse import urlparse
-
- from ...feature_extraction_utils import BatchFeature
- from ...processing_utils import ProcessorMixin
- from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy
- from ...utils import TensorType, is_torch_available
-
-
- if is_torch_available():
-     import torch
-
-
- IMAGE_TOKEN = "<image>"
-
-
- # copied from m4.training.packing
- def incremental_to_binary_attention_mask(incremental_mask, num_classes=-1):
-     # This function converts: [-1, 0, 1] => [[0, 0], [1, 0], [0, 1]]
-
-     # If any of images index are more than num_classes, set them to -1.
-     # Words after the max number of images allowed have been seen don't attend on anything
-     if num_classes != -1:
-         incremental_mask[incremental_mask >= num_classes] = -1
-
-     negatives = incremental_mask == -1
-     incremental_mask[negatives] = 0
-     attn_mask = torch.nn.functional.one_hot(incremental_mask, num_classes=num_classes)
-     attn_mask[negatives, :] = 0
-     return attn_mask
-
-
- # copied from m4.training.packing
- def image_attention_mask_for_packed_input_ids(input_ids, tokenizer):
-     image_attention_mask = torch.full_like(input_ids, fill_value=-1)
-     next_image_attention_mask = torch.full_like(input_ids, fill_value=-1)
-     image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
-     eod_token_id = tokenizer.eos_token_id
-     for batch_idx in range(input_ids.size(0)):
-         count = -1
-         seen_eod = False
-         for idx, token_id in enumerate(input_ids[batch_idx]):
-             if token_id == image_token_id:
-                 count += 1
-                 image_attention_mask[batch_idx][idx] = count
-                 seen_eod = False
-             else:
-                 image_attention_mask[batch_idx][idx] = count
-
-             if seen_eod:
-                 image_attention_mask[batch_idx][idx] = -1
-
-             if token_id == eod_token_id:
-                 seen_eod = True
-
-     for batch_idx in range(input_ids.size(0)):
-         count = -1
-         seen_eod = False
-         for idx in range(input_ids[batch_idx].size(0) - 1, -1, -1):
-             token_id = input_ids[batch_idx][idx]
-             if token_id == image_token_id:
-                 count += 1
-                 next_image_attention_mask[batch_idx][idx] = count
-                 seen_eod = False
-             else:
-                 next_image_attention_mask[batch_idx][idx] = count
-
-             if token_id == eod_token_id:
-                 seen_eod = True
-
-             if seen_eod:
-                 next_image_attention_mask[batch_idx][idx] = -1
-
-         non_negative_indices = next_image_attention_mask[batch_idx] != -1
-         next_image_attention_mask[batch_idx][non_negative_indices] -= count
-         next_image_attention_mask[batch_idx][non_negative_indices] *= -1
-
-     return image_attention_mask, next_image_attention_mask
-
-
- def is_url(string):
-     """Checks if the passed string contains a valid url and nothing else. e.g. if space is included it's immediately
-     invalidated the url"""
-     if " " in string:
-         return False
-     result = urlparse(string)
-     return all([result.scheme, result.netloc])
-
-
- class IdeficsProcessor(ProcessorMixin):
-     r"""
-     Constructs a IDEFICS processor which wraps a LLama tokenizer and IDEFICS image processor into a single processor.
-
-     [`IdeficsProcessor`] offers all the functionalities of [`IdeficsImageProcessor`] and [`LlamaTokenizerFast`]. See
-     the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information.
-
-     Args:
-         image_processor (`IdeficsImageProcessor`):
-             An instance of [`IdeficsImageProcessor`]. The image processor is a required input.
-         tokenizer (`LlamaTokenizerFast`):
-             An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input.
-         image_size (`int`, *optional*, defaults to 224): Image size (assuming a square image)
-     """
-
-     attributes = ["image_processor", "tokenizer"]
-     image_processor_class = "IdeficsImageProcessor"
-     tokenizer_class = "LlamaTokenizerFast"
-
-     def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_utterance_token=None, **kwargs):
-         if image_processor is None:
-             raise ValueError("You need to specify an `image_processor`.")
-         if tokenizer is None:
-             raise ValueError("You need to specify a `tokenizer`.")
-
-         super().__init__(image_processor, tokenizer)
-         self.current_processor = self.image_processor
-         self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
-
-         self.default_image_dims = (
-             self.image_processor.image_num_channels,
-             self.image_processor.image_size,
-             self.image_processor.image_size,
-         )
-
-         self.tokenizer_was_trained_with_end_of_utterance_token = (
-             True
-             if "<end_of_utterance>" in self.tokenizer.special_tokens_map.get("additional_special_tokens", [])
-             else False
-         )
-
-     def __call__(
-         self,
-         prompts: Union[List[TextInput], List[List[TextInput]]],
-         padding: Union[bool, str, PaddingStrategy] = False,
-         truncation: Union[bool, str, TruncationStrategy] = None,
-         max_length: Optional[int] = None,
-         transform: Callable = None,
-         add_eos_token=False,
-         add_end_of_utterance_token=None,
-         debug=False,
-         return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
-     ) -> BatchEncoding:
-         """This method takes batched or non-batched prompts made of text and images and converts them into prompts that
-         the model was trained on and prepares the image pixel values for the model to process.
-
-         Args:
-             prompts (`Union[List[TextInput], [List[List[TextInput]]]]`):
-                 either a single prompt or a batched list of prompts - see the detailed description immediately after
-                 the end of the arguments doc section.
-             padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
-                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
-                 index) among:
-                 - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
-                   sequence if provided).
-                 - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
-                   acceptable input length for the model if that argument is not provided.
-                 - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
-                   lengths).
-             max_length (`int`, *optional*):
-                 Maximum length of the returned list and optionally padding length (see above).
-             truncation (`bool`, *optional*):
-                 Activates truncation to cut input sequences longer than `max_length` to `max_length`.
-             transform (`Callable`, *optional*):
-                 A custom transform function that accepts a single image can be passed for training. For example,
-                 `torchvision.Compose` can be used to compose multiple functions. If `None` a preset inference-specific
-                 set of transforms will be applied to the images
-             add_eos_token (`bool`, *optional*, defaults to `False`):
-                 Adds `eos_token` at the end of the final prompt if True`
-             add_end_of_utterance_token (`bool`, *optional*)
-                 Whether to automatically add `<end_of_utterance>` after each prompt's text input (unless followed by an
-                 image). If `None` the tokenizer will be checked instead and if this token is found in
-                 `additional_special_tokens` then the value will be `True`.
-             debug (`bool`, *optional*, defaults to `False`):
-                 `True` value will help debug prompt generation by dumping useful information
-             return_tensors (`str` or `TensorType`, *optional*, defaults to `TensorType.PYTORCH`):
-                 The type of tensors to return. Can be one of:
-                 - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
-
-         Returns:
-             a dict with entries: `input_ids`, `attention_mask`, `pixel_values`, `image_attention_mask` which can be
-             directly passed to `model.generate`
-
-         Detailed explanation:
-
-         Each entry in `prompts` is either a text to be passed as is or an image that will be processed.
-
-         An image can be either an image object (`PIL.Image`) or a url from which the image can be retrieved.
-
-         When the processor encounters an image it'll inject `<fake_token_around_image><image><fake_token_around_image>`
-         entry into the prompt.
-
-         Example:
-
-         ```python
-         checkpoint = "HuggingFaceM4/idefics-9b"
-         processor = AutoProcessor.from_pretrained(checkpoint)
-         url = "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg"
-         img = processor.image_processor.fetch_images([url])[0]
-
-         prompts = [
-             "User:",
-             img,
-             "Describe this image.\nAssistant: An image of two kittens in grass.\n",
-             "User:",
-             "https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg",
-             "Describe this image.\nAssistant:",
-         ]
-
-         inputs = processor(prompts, return_tensors="pt")
-         generated_ids = model.generate(**inputs, max_length=100)
-         generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-         ```
-
-         In this example the `prompts` will be converted into:
-
-         ```
-         <s>User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
-         Assistant: An image of two kittens in grass.
-         User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
-         Assistant:'
-         ```
-
-         and the two images will be massaged using [`IdeficsImageProcessor.__call__`] method and placed inside the
-         `pixel_values` dict entry of the return value.
-
-         This example also examplifies that images can be passed as objects or as text urls. It can be seen that the
-         first image is passed as object and the second one as a url.
-
-         To do training do:
-
-         ```python
-         image_transform = transforms.Compose(
-             [
-                 transforms.RandomResizedCrop(
-                     (w, h), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC
-                 ),
-                 transforms.ToTensor(),
-                 transforms.Normalize(mean=self.image_mean, std=self.image_std),
-             ]
-         )
-         inputs = processor(prompts, transform=image_transform, return_tensors="pt")
-         ```
-
-         In order to help debug prompt generation enable `debug=True` which will show you what's happening.
-
-         """
-
-         # if the value isn't overriden by the user, check if the tokenizer was trained with this token and then use it
-         if add_end_of_utterance_token is None:
-             add_end_of_utterance_token = self.tokenizer_was_trained_with_end_of_utterance_token
-
-         # turn non-batched prompts into batched
-         if not any(isinstance(i, list) for i in prompts):
-             prompts = [prompts]
-
-         fake_token = "<fake_token_around_image>"
-         image_token = "<image>"
-         end_of_utterance_token = "<end_of_utterance>"
-
-         def image_tokens(last_was_image):
-             if last_was_image:
-                 return image_token + fake_token
-             else:
-                 return fake_token + image_token + fake_token
-
-         all_prompts = []
-         all_images = []
-         for sample in prompts:
-             # the model was trained on samples starting with <s>
-             full_text = f"{self.tokenizer.bos_token}"
-
-             # an image can either be an image object in the item or the url, everything else is a verbatim prompt text
-             image_objects = []
-             last_was_image = False
-             last_was_text = False
-             for i, item in enumerate(sample):
-                 if i > 0:
-                     last_was_text = True if not last_was_image else False
-
-                 if isinstance(item, str):
-                     item = item.strip(" ")
-                     if is_url(item):
-                         image = self.image_processor.fetch_images(item)
-                         full_text += image_tokens(last_was_image)
-                         image_objects.append(image)
-                         last_was_image = True
-                     else:
-                         # we add end_of_utterance_token between each subsequent text prompts (but not at the last one!)
-                         if add_end_of_utterance_token and last_was_text:
-                             full_text += end_of_utterance_token
-                         full_text += item
-                         last_was_image = False
-                 else:
-                     # must be an image obj
-                     full_text += image_tokens(last_was_image)
-                     image_objects.append(item)
-                     last_was_image = True
-
-             if add_eos_token:
-                 full_text += self.tokenizer.eos_token
-
-             if debug is True:
-                 print(f"{full_text=}")
-
-             image_objects = self.image_processor(image_objects, transform=transform)
-
-             all_prompts.append(full_text)
-             all_images.append(image_objects)
-
-         text_encoding = self.tokenizer(
-             text=all_prompts,
-             add_special_tokens=False,
-             padding=padding,
-             truncation=truncation,
-             max_length=max_length,
-         )
-         all_texts = text_encoding["input_ids"]
-
-         max_seq_len = max(len(x) for x in all_texts)
-
-         # max_num_images has to be at least 1 even when there are no images
-         max_num_images = max(len(x) for x in all_images)
-         max_num_images = max(1, max_num_images)
-
-         at_least_one_image = sum(len(x) for x in all_images) > 0
-         output_input_ids = []
-         output_images = []
-         output_attention_masks = []
-         for text, images in zip(all_texts, all_images):
-             padded_input_ids = [self.tokenizer.pad_token_id] * max_seq_len
-             unpadded_seq_len = len(text)
-             start = max_seq_len - unpadded_seq_len
-             padded_input_ids[start:] = text[:max_seq_len]
-
-             attention_mask = torch.zeros((max_seq_len,), dtype=torch.long)
-             attention_mask[start:] = 1
-
-             image_count = padded_input_ids.count(self.image_token_id)
-             local_max_num_images = min(image_count, max_num_images)
-
-             current_images = images[:local_max_num_images]
-
-             if len(current_images) > 0:
-                 padded_image_tensor = torch.zeros(max_num_images, *current_images.size()[1:])
-                 padded_image_tensor[: current_images.size(0)] = current_images
-             else:
-                 padded_image_tensor = torch.zeros(max_num_images, *self.default_image_dims)
-
-             output_images.append(padded_image_tensor)
-             output_input_ids.append(torch.tensor(padded_input_ids))
-
-             output_attention_masks.append(attention_mask)
-
-         output_input_ids = torch.stack(output_input_ids)
-         output_images = torch.stack(output_images)
-         output_attention_masks = torch.stack(output_attention_masks)
-
-         if at_least_one_image:
-             image_attention_mask, _ = image_attention_mask_for_packed_input_ids(output_input_ids, self.tokenizer)
-             image_attention_mask = incremental_to_binary_attention_mask(
-                 image_attention_mask, num_classes=max_num_images
-             )
-         else:
-             # in full language mode we set the image mask to all-0s
-             image_attention_mask = torch.zeros(
-                 output_input_ids.shape[0], output_input_ids.shape[1], 1, dtype=torch.bool
-             )
-
-         return BatchFeature(
-             data={
-                 "input_ids": output_input_ids,
-                 "attention_mask": output_attention_masks,
-                 "pixel_values": output_images,
-                 "image_attention_mask": image_attention_mask,
-             }
-         )
-
-     def batch_decode(self, *args, **kwargs):
-         """
-         This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
-         refer to the docstring of this method for more information.
-         """
-         return self.tokenizer.batch_decode(*args, **kwargs)
-
-     def decode(self, *args, **kwargs):
-         """
-         This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
-         the docstring of this method for more information.
-         """
-         return self.tokenizer.decode(*args, **kwargs)
-
-     @property
-     def model_input_names(self):
-         tokenizer_input_names = self.tokenizer.model_input_names
-         image_processor_input_names = self.image_processor.model_input_names
-         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
processing_img2html.py DELETED
@@ -1,345 +0,0 @@
- # coding=utf-8
- # Copyright 2022 The HuggingFace Inc. team.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """
- Processor class for Img2HTML.
- """
-
- from typing import Callable, List, Optional, Union
- from urllib.parse import urlparse
-
- from transformers.feature_extraction_utils import BatchFeature
- from transformers.processing_utils import ProcessorMixin
- from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy
- from transformers.utils import TensorType, is_torch_available
-
- from .image_processing_img2html import Img2HTMLImageProcessor
-
- if is_torch_available():
-     import torch
-
-
- IMAGE_TOKEN = "<image>"
-
-
- def is_url(string):
-     """Checks if the passed string contains a valid url and nothing else. e.g. if space is included it's immediately
-     invalidated the url"""
-     if " " in string:
-         return False
-     result = urlparse(string)
-     return all([result.scheme, result.netloc])
-
- class Img2HTMLProcessor(ProcessorMixin):
-     r"""
-     Constructs a Img2HTML processor which wraps a LLama tokenizer and Img2HTML image processor into a single processor.
-
-     [`Img2HTMLProcessor`] offers all the functionalities of [`Img2HTMLImageProcessor`] and [`LlamaTokenizerFast`]. See
-     the docstring of [`~Img2HTMLProcessor.__call__`] and [`~Img2HTMLProcessor.decode`] for more information.
-
-     Args:
-         image_processor (`Img2HTMLImageProcessor`):
-             An instance of [`Img2HTMLImageProcessor`]. The image processor is a required input.
-         tokenizer (`LlamaTokenizerFast`):
-             An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input.
-         image_size (`int`, *optional*, defaults to 224): Image size (assuming a square image)
-     """
-
-     attributes = ["image_processor", "tokenizer"]
-     image_processor_class = "Img2HTMLImageProcessor"
-     tokenizer_class = "LlamaTokenizerFast"
-
-     def __init__(self, image_processor, tokenizer=None, image_size=960, **kwargs):
-         if image_processor is None:
-             raise ValueError("You need to specify an `image_processor`.")
-         if tokenizer is None:
-             raise ValueError("You need to specify a `tokenizer`.")
-
-         super().__init__(image_processor, tokenizer)
-         self.current_processor = self.image_processor
-         self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
-
-         self.default_image_dims = (
-             self.image_processor.image_num_channels,
-             self.image_processor.image_size,
-             self.image_processor.image_size,
-         )
-
-     # @classmethod
-     # def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
-     #     # Hack overriding things
-     #     from pathlib import Path
-     #     from transformers.utils import direct_transformers_import
-     #     # Dynamically import the Transformers module to grab the attribute classes of the processor form their names.
-     #     transformers_module = direct_transformers_import(Path(__file__).parent)
-
-     #     args = []
-     #     for attribute_name in cls.attributes:
-     #         class_name = getattr(cls, f"{attribute_name}_class")
-     #         if isinstance(class_name, tuple):
-     #             classes = tuple(getattr(transformers_module, n) if n is not None else None for n in class_name)
-     #             use_fast = kwargs.get("use_fast", True)
-     #             if use_fast and classes[1] is not None:
-     #                 attribute_class = classes[1]
-     #             else:
-     #                 attribute_class = classes[0]
-     #         else:
-     #             if class_name == "Img2HTMLImageProcessor":
-     #                 attribute_class = Img2HTMLImageProcessor
-     #             else:
-     #                 attribute_class = getattr(transformers_module, class_name)
-
-     #         args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
-     #     return args
-
-     def __call__(
-         self,
-         prompts: Union[List[TextInput], List[List[TextInput]]],
-         padding: Union[bool, str, PaddingStrategy] = False,
-         truncation: Union[bool, str, TruncationStrategy] = None,
-         max_length: Optional[int] = None,
-         transform: Callable = None,
-         add_eos_token=False,
-         debug=False,
-         return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
-     ) -> BatchEncoding:
-         """This method takes batched or non-batched prompts made of text and images and converts them into prompts that
-         the model was trained on and prepares the image pixel values for the model to process.
-
-         Args:
-             prompts (`Union[List[TextInput], [List[List[TextInput]]]]`):
-                 either a single prompt or a batched list of prompts - see the detailed description immediately after
-                 the end of the arguments doc section.
-             padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
-                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
-                 index) among:
-                 - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
-                   sequence if provided).
-                 - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
-                   acceptable input length for the model if that argument is not provided.
-                 - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
-                   lengths).
-             max_length (`int`, *optional*):
-                 Maximum length of the returned list and optionally padding length (see above).
-             truncation (`bool`, *optional*):
-                 Activates truncation to cut input sequences longer than `max_length` to `max_length`.
-             transform (`Callable`, *optional*):
-                 A custom transform function that accepts a single image can be passed for training. For example,
-                 `torchvision.Compose` can be used to compose multiple functions. If `None` a preset inference-specific
-                 set of transforms will be applied to the images
-             add_eos_token (`bool`, *optional*, defaults to `False`):
-                 Adds `eos_token` at the end of the final prompt if True`
-             debug (`bool`, *optional*, defaults to `False`):
-                 `True` value will help debug prompt generation by dumping useful information
-             return_tensors (`str` or `TensorType`, *optional*, defaults to `TensorType.PYTORCH`):
-                 The type of tensors to return. Can be one of:
-                 - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
-
-         Returns:
-             a dict with entries: `input_ids`, `attention_mask`, `pixel_values`, `image_attention_mask` which can be
-             directly passed to `model.generate`
-
-         Detailed explanation:
-
-         Each entry in `prompts` is either a text to be passed as is or an image that will be processed.
-
-         An image can be either an image object (`PIL.Image`) or a url from which the image can be retrieved.
-
-         When the processor encounters an image it'll inject `<fake_token_around_image><image><fake_token_around_image>`
-         entry into the prompt.
-
-         Example:
-
-         ```python
-         checkpoint = "HuggingFaceM4/Img2HTML-9b"
-         processor = AutoProcessor.from_pretrained(checkpoint)
-         url = "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg"
-         img = processor.image_processor.fetch_images([url])[0]
-
-         prompts = [
-             "User:",
-             img,
-             "Describe this image.\nAssistant: An image of two kittens in grass.\n",
-             "User:",
-             "https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg",
-             "Describe this image.\nAssistant:",
-         ]
-
-         inputs = processor(prompts, return_tensors="pt")
-         generated_ids = model.generate(**inputs, max_length=100)
-         generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-         ```
-
-         In this example the `prompts` will be converted into:
-
-         ```
-         <s>User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
-         Assistant: An image of two kittens in grass.
-         User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
-         Assistant:'
-         ```
-
-         and the two images will be massaged using [`Img2HTMLImageProcessor.__call__`] method and placed inside the
-         `pixel_values` dict entry of the return value.
-
-         This example also examplifies that images can be passed as objects or as text urls. It can be seen that the
-         first image is passed as object and the second one as a url.
-
-         To do training do:
-
-         ```python
-         image_transform = transforms.Compose(
-             [
-                 transforms.RandomResizedCrop(
-                     (w, h), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC
-                 ),
-                 transforms.ToTensor(),
-                 transforms.Normalize(mean=self.image_mean, std=self.image_std),
-             ]
-         )
-         inputs = processor(prompts, transform=image_transform, return_tensors="pt")
-         ```
-
-         In order to help debug prompt generation enable `debug=True` which will show you what's happening.
-
-         """
-         # turn non-batched prompts into batched
-         if not any(isinstance(i, list) for i in prompts):
-             prompts = [prompts]
-
-         fake_token = "<fake_token_around_image>"
-         image_token = "<image>"
-
-         def image_tokens(last_was_image):
-             if last_was_image:
-                 return image_token + fake_token
-             else:
-                 return fake_token + image_token + fake_token
-
-         all_prompts = []
-         all_images = []
-         for sample in prompts:
-             # the model was trained on samples starting with <s>
-             full_text = f"{self.tokenizer.bos_token}"
-
-             # an image can either be an image object in the item or the url, everything else is a verbatim prompt text
-             image_objects = []
-             last_was_image = False
-             last_was_text = False
-             for i, item in enumerate(sample):
-                 if i > 0:
-                     last_was_text = True if not last_was_image else False
-
-                 if isinstance(item, str):
-                     item = item.strip(" ")
-                     if is_url(item):
-                         image = self.image_processor.fetch_images(item)
-                         full_text += image_tokens(last_was_image)
-                         image_objects.append(image)
-                         last_was_image = True
-                     else:
-                         full_text += item
-                         last_was_image = False
-                 else:
-                     # must be an image obj
-                     full_text += image_tokens(last_was_image)
-                     image_objects.append(item)
-                     last_was_image = True
-
-             if add_eos_token:
-                 full_text += self.tokenizer.eos_token
-
-             if debug is True:
-                 print(f"{full_text=}")
-
-             image_objects = self.image_processor(image_objects, transform=transform)
-
-             all_prompts.append(full_text)
-             all_images.append(image_objects)
-
-         text_encoding = self.tokenizer(
-             text=all_prompts,
-             add_special_tokens=False,
-             padding=padding,
-             truncation=truncation,
-             max_length=max_length,
-         )
-         all_texts = text_encoding["input_ids"]
-
-         max_seq_len = max(len(x) for x in all_texts)
-
-         # max_num_images has to be at least 1 even when there are no images
-         max_num_images = max(len(x) for x in all_images)
-         max_num_images = max(1, max_num_images)
-
-         output_input_ids = []
-         output_images = []
-         output_attention_masks = []
-         for text, images in zip(all_texts, all_images):
-             padded_input_ids = [self.tokenizer.pad_token_id] * max_seq_len
-             unpadded_seq_len = len(text)
-             start = max_seq_len - unpadded_seq_len
-             padded_input_ids[start:] = text[:max_seq_len]
-
-             attention_mask = torch.zeros((max_seq_len,), dtype=torch.long)
-             attention_mask[start:] = 1
-
-             image_count = padded_input_ids.count(self.image_token_id)
-             local_max_num_images = min(image_count, max_num_images)
-
-             current_images = images[:local_max_num_images]
-
-             if len(current_images) > 0:
-                 padded_image_tensor = torch.zeros(max_num_images, *current_images.size()[1:])
-                 padded_image_tensor[: current_images.size(0)] = current_images
-             else:
-                 padded_image_tensor = torch.zeros(max_num_images, *self.default_image_dims)
-
-             output_images.append(padded_image_tensor)
-             output_input_ids.append(torch.tensor(padded_input_ids))
-
-             output_attention_masks.append(attention_mask)
-
-         output_input_ids = torch.stack(output_input_ids)
-         output_images = torch.stack(output_images)
-         output_attention_masks = torch.stack(output_attention_masks)
-
-
-         return BatchFeature(
-             data={
-                 "input_ids": output_input_ids,
-                 "attention_mask": output_attention_masks,
-                 "pixel_values": output_images,
-             }
-         )
-
-     def batch_decode(self, *args, **kwargs):
-         """
-         This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
-         refer to the docstring of this method for more information.
-         """
-         return self.tokenizer.batch_decode(*args, **kwargs)
-
-     def decode(self, *args, **kwargs):
-         """
-         This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
-         the docstring of this method for more information.
-         """
-         return self.tokenizer.decode(*args, **kwargs)
-
-     @property
-     def model_input_names(self):
-         tokenizer_input_names = self.tokenizer.model_input_names
-         image_processor_input_names = self.image_processor.model_input_names
-         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))