# paligemma-cpu-gguf / paligemma_bv.py
"""Wraps `big_vision` PaliGemma model for easy use in demo."""
from collections.abc import Callable
import dataclasses
from typing import Any
import jax
import jax.numpy as jnp
import ml_collections
import numpy as np
import PIL.Image
from big_vision import sharding
from big_vision import utils
from big_vision.models.proj.paligemma import paligemma
from big_vision.pp import builder as pp_builder
from big_vision.pp import ops_general # pylint: disable=unused-import
from big_vision.pp import ops_image # pylint: disable=unused-import
from big_vision.pp import ops_text # pylint: disable=unused-import
from big_vision.pp import tokenizer
from big_vision.pp.proj.paligemma import ops as ops_paligemma # pylint: disable=unused-import
from big_vision.trainers.proj.paligemma import predict_fns
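
# 1-D mesh over all available devices; the single 'data' axis is used for
# both FSDP parameter sharding and data-parallel batch sharding below.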
mesh = jax.sharding.Mesh(jax.devices(), 'data')
def _recover_bf16(x):
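  # Raw 2-byte void arrays ('V2') are bfloat16 payloads that numpy read
  # without interpreting; reinterpret the bytes as bfloat16.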
if x.dtype == np.dtype('V2'):
x = x.view('bfloat16')
return x
def _load(
path, tokenizer_spec='gemma(tokensets=("loc", "seg"))', vocab_size=257_152
):
"""Loads model, params, decode functions and tokenizer."""
tok = tokenizer.get_tokenizer(tokenizer_spec)
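  # PaliGemma config: SigLIP So400m/14 vision encoder + Gemma-2B LLM; the
  # vocab size (257_152) covers Gemma's tokens plus the loc/seg tokensets.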
config = ml_collections.FrozenConfigDict(dict(
llm_model='proj.paligemma.gemma_bv',
llm=dict(vocab_size=vocab_size, variant='gemma_2b'),
img=dict(variant='So400m/14', pool_type='none', scan=True),
))
model = paligemma.Model(**config)
  decode_fns = predict_fns.get_all(model)
  decode = decode_fns['decode']
  beam_decode = decode_fns['beam_decode']
params_cpu = paligemma.load(None, path, config)
# Some numpy versions don't load bfloat16 correctly:
params_cpu = jax.tree.map(_recover_bf16, params_cpu)
return model, params_cpu, decode, beam_decode, tok
def _shard_params(params_cpu):
"""Shards `params_cpu` with fsdp strategy on all available devices."""
params_sharding = sharding.infer_sharding(
params_cpu, strategy=[('.*', 'fsdp(axis="data")')], mesh=mesh
)
params = jax.tree.map(utils.reshard, params_cpu, params_sharding)
return params
def _pil2np(img):
  """Converts a `PIL.Image.Image` or `np.ndarray` to an RGB `np.ndarray`."""
  if isinstance(img, PIL.Image.Image):
    img = np.array(img)
  if img.ndim == 2:
    img = img[..., None]  # Add a channel axis to grayscale input.
  if img.shape[-1] == 1:
    img = np.repeat(img, 3, axis=-1)  # Grayscale to RGB.
  return img[..., :3]  # Drop an alpha channel, if present.
def _prepare_batch(
images,
prefixes,
*,
res=224,
tokenizer_spec='gemma(tokensets=("loc", "seg"))',
suffixes=None,
text_len=64,
):
"""Returns non-sharded batch."""
pp_fn = pp_builder.get_preprocess_fn('|'.join([
f'resize({res}, antialias=True)|value_range(-1, 1)',
f"tok(key='prefix', bos='yes', model='{tokenizer_spec}')",
f"tok(key='septok', text='\\n', model='{tokenizer_spec}')",
f"tok(key='suffix', model='{tokenizer_spec}')",
'masked_concat(["prefix", "septok", "suffix"], mask_ar=[0, 0, 1], mask_input=[1, 1, 1])', # pylint: disable=line-too-long
f'tolen({text_len}, pad_value=0, key="text")',
f'tolen({text_len}, pad_value=1, key="mask_ar")',
f'tolen({text_len}, pad_value=0, key="mask_input")',
'keep("image", "text", "mask_ar", "mask_input")',
]), log_data=False)
assert not isinstance(prefixes, str), f'expected batch: {prefixes}'
assert (
isinstance(images, (list, tuple)) or images.ndim == 4
), f'expected batch: {images.shape}'
if suffixes is None:
suffixes = [''] * len(prefixes)
assert len(prefixes) == len(suffixes) == len(images)
examples = [{'_mask': True, **pp_fn({
'image': np.asarray(_pil2np(image)),
'prefix': np.array(prefix),
'suffix': np.array(suffix),
})} for image, prefix, suffix in zip(images, prefixes, suffixes)]
  batch = jax.tree.map(lambda *xs: np.stack(xs), *examples)
return batch
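
# Example output structure of `_prepare_batch` (a sketch; shapes assume the
# defaults res=224, text_len=64 and a single example):
#   batch = _prepare_batch([img], ['caption en'])
#   batch['image'].shape       # (1, 224, 224, 3), values in [-1, 1]
#   batch['text'].shape        # (1, 64) token ids
#   batch['mask_ar'].shape     # (1, 64) autoregressive mask
#   batch['mask_input'].shape  # (1, 64) input (non-padding) mask
#   batch['_mask']             # (1,) example-validity mask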
def _shard_batch(batch, n=None):
"""Shards `batch` with fsdp strategy on all available devices."""
if n is None:
n = jax.local_device_count()
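  # Pad the leading (batch) axis to a multiple of `n` so it splits evenly
  # across devices; `-len(x) % n` is the remaining shortfall.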
def pad(x):
return jnp.pad(x, [(0, -len(x) % n)] + [(0, 0)] * (x.ndim - 1))
batch = {k: pad(v) for k, v in batch.items()}
data_sharding = jax.sharding.NamedSharding(
mesh, jax.sharding.PartitionSpec('data')
)
batch_on_device = utils.reshard(batch, data_sharding)
return batch_on_device
@dataclasses.dataclass(frozen=True, kw_only=True, order=True)
class PaligemmaConfig:
"""Desribes a `big_vision` PaliGemma model."""
ckpt: str
res: int
text_len: int
tokenizer: str
vocab_size: int
@dataclasses.dataclass(frozen=True, kw_only=True)
class PaliGemmaModel:
"""Wraps a `big_vision` PaliGemma model."""
config: PaligemmaConfig
tokenizer: tokenizer.Tokenizer
decode: Callable[..., Any]
beam_decode: Callable[..., Any]
@classmethod
def shard_batch(cls, batch):
return _shard_batch(batch)
@classmethod
def shard_params(cls, params_cpu):
return _shard_params(params_cpu)
def prepare_batch(self, images, texts, suffixes=None):
return _prepare_batch(
images=images,
prefixes=texts,
suffixes=suffixes,
res=self.config.res,
tokenizer_spec=self.config.tokenizer,
text_len=self.config.text_len,
)
def predict(
self,
params,
batch,
devices=None,
max_decode_len=128,
sampler='greedy',
**kw,
):
"""Returns tokens."""
if devices is None:
devices = jax.devices()
if sampler == 'beam':
decode = self.beam_decode
else:
decode = self.decode
kw['sampler'] = sampler
return decode(
{'params': params},
batch=batch,
devices=devices,
eos_token=self.tokenizer.eos_token,
max_decode_len=max_decode_len,
**kw,
)
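
# A pytree of CPU-resident model parameters, as returned by `paligemma.load`.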
ParamsCpu = Any
def load_model(config: PaligemmaConfig) -> tuple[PaliGemmaModel, ParamsCpu]:
"""Loads model from config."""
model, params_cpu, decode, beam_decode, tok = _load(
path=config.ckpt,
tokenizer_spec=config.tokenizer,
vocab_size=config.vocab_size,
)
  del model  # Only the decode fns, tokenizer, and params are needed below.
return PaliGemmaModel(
config=config, tokenizer=tok, decode=decode, beam_decode=beam_decode,
), params_cpu
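
# Example end-to-end usage (a sketch, not part of the original demo: the
# checkpoint path and prompt are hypothetical placeholders, and `to_str`
# assumes the standard `big_vision` tokenizer interface):
#
#   config = PaligemmaConfig(
#       ckpt='/path/to/paligemma.ckpt',
#       res=224,
#       text_len=64,
#       tokenizer='gemma(tokensets=("loc", "seg"))',
#       vocab_size=257_152,
#   )
#   model, params_cpu = load_model(config)
#   params = model.shard_params(params_cpu)
#   batch = model.prepare_batch([PIL.Image.open('cat.jpg')], ['caption en'])
#   tokens = model.predict(params, model.shard_batch(batch))
#   print(model.tokenizer.to_str(tokens[0]))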