# File size: 5,896 Bytes

# ab80423

from functools import partial

import torch
import torch.nn.functional as F
from transformers.processing_utils import ProcessorMixin
from transformers.image_processing_utils import BaseImageProcessor
from transformers import AutoTokenizer, AutoConfig
from transformers import BatchFeature

from PIL import Image
from torchvision.transforms import (
    Compose,
    Normalize,
    Resize,
    ToTensor
)


# Per-channel mean and standard deviation, expressed on a 0-1 scale.
# ImageProcessor passes them to ``Normalize`` and also scales the mean by 255
# to derive the fill colour used when padding an image to a square.
# NOTE(review): the numbers look like the CLIP normalisation statistics rather
# than the classic ImageNet ones (0.485, 0.456, 0.406) -- confirm the naming.
IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073)
IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711)


def convert_to_rgb(x):
    """Return the result of ``x.convert("RGB")``.

    Args:
        x: Any object exposing a ``convert(mode)`` method; the image pipeline
            in this module passes it the image being preprocessed.

    Returns:
        Whatever ``x.convert`` returns for the ``"RGB"`` mode.
    """
    rgb_image = x.convert("RGB")
    return rgb_image


def expand2square(image, background_color):
    """Pad ``image`` onto a square canvas filled with ``background_color``.

    Args:
        image: Image object exposing ``size`` as ``(width, height)`` and
            ``mode``; it is pasted onto the new canvas.
        background_color: Fill value handed to ``Image.new`` for the canvas.

    Returns:
        ``image`` itself when it is already square; otherwise a new image
        whose side equals the longer edge, with the original centred along
        the shorter axis.
    """
    width, height = image.size
    if width == height:
        # Already square: hand back the very same object, no copy is made.
        return image

    side = max(width, height)
    canvas = Image.new(image.mode, (side, side), background_color)
    # The offset is 0 on the longer axis and half the difference (floored)
    # on the shorter one, which centres the original on the canvas.
    canvas.paste(image, ((side - width) // 2, (side - height) // 2))
    return canvas


class ImageProcessor(BaseImageProcessor):
    """Image preprocessing pipeline built from torchvision transforms.

    The pipeline converts the input to RGB, pads it to a square using the
    normalisation mean (scaled to 0-255) as fill colour, resizes it, turns it
    into a tensor and normalises it with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
    """

    def __init__(
        self,
        image_size: int,
        **kwargs
    ):
        """Assemble the transform pipeline.

        Args:
            image_size: Size argument forwarded to ``Resize``.
            **kwargs: Forwarded untouched to ``BaseImageProcessor.__init__``.
        """
        super().__init__(**kwargs)

        # Padding colour: the per-channel mean expressed as 0-255 integers.
        pad_color = tuple(int(255 * channel) for channel in IMAGENET_MEAN)
        pad_to_square = partial(expand2square, background_color=pad_color)
        normalize = Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)

        steps = [
            convert_to_rgb,
            pad_to_square,
            Resize(image_size),
            ToTensor(),
            normalize,
        ]
        self.transform = Compose(steps)

    def preprocess(
        self,
        image: Image
    ):
        """Run ``image`` through the transform pipeline and return the result."""
        pipeline = self.transform
        return pipeline(image)

    def __repr__(self):
        """Use the representation of the underlying transform pipeline."""
        return f"{self.transform!r}"


class VLMProcessor(ProcessorMixin):
    """Prepare chat-formatted text and images as model inputs.

    Text prompts are wrapped in a fixed system/user chat template that
    contains an ``<image>`` placeholder, tokenized, and left-padded into a
    batch. Images are run through ``ImageProcessor`` and stacked into a single
    float tensor.
    """

    def __init__(self, config):
        """Build the tokenizer and image transform described by ``config``.

        Args:
            config: Configuration object. The attributes read here are
                ``image_size``, ``text_decoder_name_or_path`` and
                ``num_image_latents``.
        """
        self.config = config
        self.image_size = config.image_size

        self.feature_extractor = ImageProcessor(self.image_size)
        self.tokenizer = AutoTokenizer.from_pretrained(
            config.text_decoder_name_or_path, additional_special_tokens=["<image>"]
        )
        self.tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
        self.num_image_latents = config.num_image_latents
        # NOTE: ``ProcessorMixin.__init__`` is not invoked here; the tokenizer
        # and the image transform are attached to the instance by hand.

    def __call__(
        self, text=None, images=None, return_tensors=None, **kwargs
    ):
        """Encode prompts and/or images.

        Args:
            text: A single prompt string or an iterable of prompt strings.
            images: A single image, or a list/tuple of images.
            return_tensors: Passed as ``tensor_type`` to ``BatchFeature`` when
                only images are supplied. The text path always produces torch
                tensors and does not consult this value.
            **kwargs: Accepted for call-compatibility; not used.

        Returns:
            A ``BatchFeature`` holding ``input_ids`` and ``attention_mask``
            when ``text`` is given and ``images`` when ``images`` is given.

        Raises:
            ValueError: If both ``text`` and ``images`` are ``None``, or if
                ``text`` is an empty collection.
        """
        if text is None and images is None:
            raise ValueError(
                "You have to specify either text or images. Both cannot be None."
            )

        encoding = None if text is None else self._encode_text(text)
        image_features = None if images is None else self._encode_images(images)

        if encoding is None:
            return BatchFeature(
                data={
                    "images": image_features,
                },
                tensor_type=return_tensors,
            )

        if image_features is not None:
            encoding["images"] = image_features
        return encoding

    def _encode_text(self, text):
        """Tokenize prompts with the chat template and left-pad them into a batch."""
        prompts = [text] if isinstance(text, str) else list(text)
        if not prompts:
            raise ValueError("`text` must contain at least one prompt.")

        token_rows = []
        for prompt in prompts:
            messages = [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f" <image> {prompt}"},
            ]
            tokenized_prompt = self.tokenizer.apply_chat_template(
                messages, add_generation_prompt=True, return_tensors="pt"
            )
            # Only the first row of the returned tensor is used.
            token_rows.append(tokenized_prompt[0])

        batch_size = len(token_rows)
        max_len = max(len(row) for row in token_rows)
        input_ids = torch.full(
            (batch_size, max_len),
            fill_value=self.tokenizer.pad_token_id,
            dtype=torch.int64,
        )
        attention_mask = torch.zeros((batch_size, max_len), dtype=torch.int64)

        # Right-align every prompt so that padding sits on the left.
        for row_idx, row in enumerate(token_rows):
            input_ids[row_idx, -len(row):] = row
            attention_mask[row_idx, -len(row):] = 1

        # Extend the mask on the right with ``num_image_latents - 1`` ones.
        # NOTE(review): presumably the single ``<image>`` token is expanded to
        # ``num_image_latents`` positions inside the model -- confirm.
        attention_mask = F.pad(
            attention_mask, pad=(0, self.num_image_latents - 1), value=1
        )

        return BatchFeature(
            data={"input_ids": input_ids, "attention_mask": attention_mask}
        )

    def _encode_images(self, images):
        """Run one image or a list/tuple of images through the image transform."""
        if not isinstance(images, (list, tuple)):
            # A single image gets a leading batch dimension of size 1.
            return self.feature_extractor(images).unsqueeze(0)

        image_features = torch.empty(
            (len(images), 3, self.image_size, self.image_size),
            dtype=torch.float32,
        )
        for i, image in enumerate(images):
            image_features[i] = self.feature_extractor(image)
        return image_features

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the underlying tokenizer's
        [`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the underlying tokenizer's [`~PreTrainedTokenizer.decode`].
        Please refer to the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path,
        trust_remote_code=False,
        **kwargs
    ):
        """Create a processor from the configuration found at the given location.

        Args:
            pretrained_model_name_or_path: Identifier or path handed to
                ``AutoConfig.from_pretrained``.
            trust_remote_code: Forwarded to ``AutoConfig.from_pretrained``.
            **kwargs: Accepted for call-compatibility; not used.

        Returns:
            A new instance of ``cls`` built from the loaded configuration.
        """
        config = AutoConfig.from_pretrained(
            pretrained_model_name_or_path,
            trust_remote_code=trust_remote_code
        )
        return cls(config)