Video-LLaVA

Runtime error

Video-LLaVA / llava / model / multimodal_encoder / languagebind / image / processing_image.py

LinB203

61f3f56 about 1 year ago

3.33 kB

	import torch
	from PIL import Image
	from torchvision import transforms
	from transformers import ProcessorMixin, BatchEncoding
	from transformers.image_processing_utils import BatchFeature

	# Mean and standard deviation passed to transforms.Normalize in
	# get_image_transform; the mean is also exposed as
	# LanguageBindImageProcessor.image_mean.
	# Three values each -- presumably one per RGB channel, since
	# load_and_transform_image converts opened files to 'RGB'; confirm.
	OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
	OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)

	def make_list_of_images(x):
	    """Return ``x`` as a list of images.

	    Args:
	        x: A single image (any non-sequence object, e.g. a path or a loaded
	            image), a list of images, or a tuple of images.

	    Returns:
	        ``x`` itself when it is already a list (no copy is made), a new list
	        with the same items when it is a tuple, otherwise a one-element list
	        containing ``x``.
	    """
	    if isinstance(x, list):
	        return x
	    if isinstance(x, tuple):
	        # A tuple is a batch, not a single image; wrapping it whole would
	        # hand the tuple itself to the per-image transform downstream.
	        return list(x)
	    return [x]

	def get_image_transform(config):
	    """Build the torchvision preprocessing pipeline applied to each image.

	    Args:
	        config: Configuration object; it must expose a ``vision_config``
	            attribute, otherwise the attribute lookup below raises
	            ``AttributeError``.

	    Returns:
	        A ``transforms.Compose`` that, in order: converts the input to a
	        tensor, applies ``Resize(224)`` with bicubic interpolation, applies
	        ``CenterCrop(224)``, and normalizes with ``OPENAI_DATASET_MEAN`` /
	        ``OPENAI_DATASET_STD``.
	    """
	    # The vision sub-config is looked up (so a config without one fails here)
	    # but none of its fields are read: the sizes below are hard-coded.
	    config = config.vision_config
	    return transforms.Compose(
	        [
	            transforms.ToTensor(),
	            transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
	            transforms.CenterCrop(224),
	            # Normalization constants are the image statistics defined at
	            # module level.
	            transforms.Normalize(OPENAI_DATASET_MEAN, OPENAI_DATASET_STD),
	        ]
	    )


	def load_and_transform_image(image_path, transform):
	    """Load an image when given a path, then apply ``transform`` to it.

	    Args:
	        image_path: Either a filesystem path (``str`` or ``os.PathLike``),
	            which is opened with PIL and converted to RGB, or any other
	            object, which is handed to ``transform`` unchanged.
	        transform: Callable applied to the image.

	    Returns:
	        Whatever ``transform`` returns for the image.
	    """
	    import os  # function-scope so this block needs no file-level import change

	    if isinstance(image_path, (str, os.PathLike)):
	        # Close the underlying file deterministically; convert() yields a
	        # separate image object that remains usable after the file is closed.
	        with Image.open(image_path) as opened:
	            image = opened.convert('RGB')
	    else:
	        image = image_path
	    return transform(image)

	class LanguageBindImageProcessor(ProcessorMixin):
	    """Processor that bundles the image transform with an optional tokenizer.

	    Calling an instance tokenizes ``text`` (when given) and/or transforms
	    ``images`` (when given) into a stacked ``pixel_values`` tensor.

	    Attributes:
	        config: The configuration object passed to ``__init__``.
	        transform: Pipeline built by ``get_image_transform(config)``.
	        image_processor: The ``load_and_transform_image`` function.
	        tokenizer: Tokenizer used for ``text``; may be ``None``.
	        image_mean: ``OPENAI_DATASET_MEAN``.
	        crop_size: ``{'height': 224, 'width': 224}``.
	    """

	    attributes = []
	    tokenizer_class = "LanguageBindImageTokenizer"

	    def __init__(self, config, tokenizer=None, **kwargs):
	        """Store the config and tokenizer and build the image transform.

	        Args:
	            config: Configuration object forwarded to ``get_image_transform``.
	            tokenizer: Optional tokenizer; required only when ``text`` is
	                later passed to ``__call__`` or when decoding.
	            **kwargs: Forwarded to ``ProcessorMixin.__init__``.
	        """
	        super().__init__(**kwargs)
	        self.config = config
	        self.transform = get_image_transform(config)
	        self.image_processor = load_and_transform_image
	        self.tokenizer = tokenizer
	        self.image_mean = OPENAI_DATASET_MEAN
	        self.crop_size = {'height': 224, 'width': 224}

	    def __call__(self, images=None, text=None, context_length=77, return_tensors=None, **kwargs):
	        """Tokenize ``text`` and/or transform ``images``.

	        Args:
	            images: A single image/path or a list of them; each item is
	                passed through ``self.image_processor`` with
	                ``self.transform``.
	            text: Input forwarded to the tokenizer.
	            context_length: ``max_length`` used for tokenization; text is
	                padded and truncated to this length.
	            return_tensors: Forwarded to the tokenizer only. The image
	                branch always produces a ``torch`` tensor via
	                ``torch.stack``.
	            **kwargs: Extra keyword arguments forwarded to the tokenizer;
	                ignored when only ``images`` is given.

	        Returns:
	            The tokenizer output (with a ``"pixel_values"`` entry added when
	            images were also given), or ``{"pixel_values": tensor}`` when
	            only images were given.

	        Raises:
	            ValueError: If both ``text`` and ``images`` are ``None``, or if
	                ``text`` is given but no tokenizer was configured.
	        """
	        if text is None and images is None:
	            raise ValueError("You have to specify either text or images. Both cannot be none.")

	        encoding = None
	        if text is not None:
	            if self.tokenizer is None:
	                # Fail with a clear message instead of "'NoneType' object is
	                # not callable".
	                raise ValueError("Text was provided but this processor has no tokenizer.")
	            encoding = self.tokenizer(
	                text,
	                max_length=context_length,
	                padding='max_length',
	                truncation=True,
	                return_tensors=return_tensors,
	                **kwargs,
	            )

	        if images is None:
	            return encoding

	        pixel_values = torch.stack(
	            [self.image_processor(image, self.transform) for image in make_list_of_images(images)]
	        )
	        if text is None:
	            return {"pixel_values": pixel_values}
	        encoding["pixel_values"] = pixel_values
	        return encoding

	    def preprocess(self, images, return_tensors):
	        """Transform ``images`` only; equivalent to calling the processor without text."""
	        return self(images=images, return_tensors=return_tensors)

	    def batch_decode(self, skip_special_tokens=True, *args, **kwargs):
	        """
	        This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
	        refer to the docstring of this method for more information.

	        NOTE(review): ``skip_special_tokens`` is the first positional parameter, so a lone positional argument
	        binds to it rather than being forwarded. Pass the sequences after it or by keyword.
	        """
	        return self.tokenizer.batch_decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)

	    def decode(self, skip_special_tokens=True, *args, **kwargs):
	        """
	        This method forwards all its arguments to CLIPTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
	        the docstring of this method for more information.

	        NOTE(review): ``skip_special_tokens`` is the first positional parameter, so a lone positional argument
	        binds to it rather than being forwarded. Pass the token ids after it or by keyword.
	        """
	        return self.tokenizer.decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)