Apollo-LMMs-Apollo-1_5B-t32 / modeling_apollo.py
GoodiesHere's picture
Upload folder using huggingface_hub
4b87979 verified
from typing import List, Optional, Tuple, Union
import warnings, os, torch
import torch.nn as nn
from transformers import AutoConfig, PretrainedConfig, PreTrainedModel, AutoModelForCausalLM, AutoTokenizer
from transformers.modeling_utils import ContextManagers, no_init_weights
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.generation.utils import GenerateOutput
from .configuration_apollo import ApolloConfig
from .vision_tower import ApolloVisionTower
from .mm_connector import MMConnector
IGNORE_INDEX = -100
X_TOKEN_INDEX = -200
def get_model_config(config):
default_keys = ["llm_cfg", "vision_tower_cfg", "mm_connector_cfg"]
if hasattr(config, "_name_or_path") and len(config._name_or_path) >= 2:
root_path = config._name_or_path
else:
root_path = config.resume_path
return_pths = []
for key in default_keys:
cfg = getattr(config, key, None)
if isinstance(cfg, dict):
try:
return_pths.append(os.path.join(root_path, key[:-4]))
except:
raise ValueError(f"Cannot find resume path in config for {key}!")
elif isinstance(cfg, PretrainedConfig):
return_pths.append(os.path.join(root_path, key[:-4]))
elif isinstance(cfg, str):
return_pths.append(cfg)
return_list = []
for pth in return_pths:
return_list.append(AutoConfig.from_pretrained(pth, trust_remote_code=True))
return return_list
def build_llm_and_tokenizer(
llm_cfg: str,
config: PretrainedConfig,
attn_implementation=None,
model_max_length=None,
*args,
**kwargs,
) -> PreTrainedModel:
llm_arch = getattr(llm_cfg, "architectures")[0].lower()
llm_path = llm_cfg._name_or_path
llm = AutoModelForCausalLM.from_pretrained(
llm_path, config=llm_cfg, torch_dtype=eval(config.model_dtype), *args, **kwargs
)
tokenizer = AutoTokenizer.from_pretrained(
llm_path,
model_max_length=llm_cfg.model_max_length,
padding_side="right",
use_fast=False,
legacy=False,
**kwargs
)
#config.hidden_size = llm.config.hidden_size
return llm, tokenizer
class ApolloForCausalLM(PreTrainedModel):
def __init__(self, config: ApolloConfig, *args, **kwargs):
super().__init__(config)
llm_cfg, vision_tower_cfg, mm_connector_cfg = get_model_config(config)
model_dtype = getattr(config, "model_dtype", "torch.float16")
if not hasattr(config, "model_dtype"):
warnings.warn("model_dtype not found in config, defaulting to torch.float16.")
config.model_dtype = model_dtype
# Initialize weights and apply final processing
self.lm_head = nn.Linear(llm_cfg.hidden_size, config.vocab_size, bias=False)
self.vision_tower = ApolloVisionTower(config, vision_tower_cfg)
self.mm_connector = MMConnector.from_pretrained(mm_connector_cfg._name_or_path)
self.llm, self.tokenizer = build_llm_and_tokenizer(llm_cfg, config, *args, **kwargs)
self.post_init()
self.is_loaded = True
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
vision_input: Optional[List[torch.FloatTensor]] = None,
data_types: Optional[List[str]] = None,
return_dict: Optional[bool] = None,
cache_position=None,
) -> Union[Tuple, CausalLMOutputWithPast]:
if inputs_embeds is None:
(
input_ids,
position_ids,
attention_mask,
past_key_values,
inputs_embeds,
labels
) = self.prepare_inputs_labels_for_multimodal(
input_ids,
position_ids,
attention_mask,
past_key_values,
labels,
vision_input,
data_types
)
return self.get_llm().forward(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
labels=labels,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
@torch.no_grad()
def generate(
self,
inputs: Optional[torch.Tensor] = None,
vision_input: Optional[List[torch.Tensor]] = None,
data_types: Optional[List[str]] = None,
**kwargs,
) -> Union[GenerateOutput, torch.LongTensor]:
position_ids = kwargs.pop("position_ids", None)
attention_mask = kwargs.pop("attention_mask", None)
if "inputs_embeds" in kwargs:
raise NotImplementedError("`inputs_embeds` is not supported")
if vision_input is not None:
(inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(
inputs, position_ids, attention_mask, None, None, vision_input, data_types=data_types)
else:
inputs_embeds = self.embed_tokens(inputs)
return self.get_llm().generate(position_ids=position_ids, attention_mask=attention_mask,
inputs_embeds=inputs_embeds, **kwargs)
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
vision_input = kwargs.pop("vision_input", None)
data_types = kwargs.pop("data_types", None)
inputs = self.get_llm().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values,
inputs_embeds=inputs_embeds, **kwargs)
if vision_input is not None:
inputs["vision_input"] = vision_input
if data_types is not None:
inputs["data_types"] = data_types
return inputs
@classmethod
def from_pretrained(
cls,
pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
*model_args,
config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
cache_dir: Optional[Union[str, os.PathLike]] = None,
ignore_mismatched_sizes: bool = False,
force_download: bool = False,
local_files_only: bool = False,
token: Optional[Union[str, bool]] = None,
revision: str = "main",
use_safetensors: bool = None,
**kwargs,
):
return cls.load_pretrained(
pretrained_model_name_or_path,
*model_args,
config=config,
cache_dir=cache_dir,
ignore_mismatched_sizes=ignore_mismatched_sizes,
force_download=force_download,
local_files_only=local_files_only,
token=token,
revision=revision,
use_safetensors=use_safetensors,
**kwargs,
)
def get_llm(self):
return self.llm
def get_vision_tower(self):
return self.vision_tower
def get_mm_connector(self):
return self.mm_connector
@classmethod
def load_pretrained(cls, model_path_or_config, *args, **kwargs):
kwargs.pop("config", None)
if isinstance(model_path_or_config, str):
config = AutoConfig.from_pretrained(model_path_or_config, trust_remote_code=True, **kwargs)
elif isinstance(model_path_or_config, ApolloConfig):
config = model_path_or_config
else:
raise NotImplementedError(f"wrong type, {type(model_path_or_config)} \
{isinstance(model_path_or_config, ApolloConfig)}")
model_dtype = getattr(config, "model_dtype", "torch.float16")
if not hasattr(config, "model_dtype"):
warnings.warn("model_dtype not found in config, defaulting to torch.float16.")
config.model_dtype = model_dtype
with ContextManagers([no_init_weights(_enable=True), ]):
vlm = cls(config, *args, **kwargs)
if hasattr(vlm, "llm") and hasattr(vlm, "vision_tower") and hasattr(vlm, "mm_connector"):
if vlm.is_loaded:
return vlm
else:
print('loading model failed!')
else:
print('loading model failed!')
def _encode_mm(self, x):
x = self.get_vision_tower()(x)
x = self.mm_connector(x)
return x
def encode_mm_minibatch(self, x):
split_sizes = [x_s[0].shape[0] for x_s in x]
x = [torch.split(torch.cat([x_s[i] for x_s in x], dim=0), self.config.encode_batch_size) for i in
range(self.get_vision_tower().num_vision_encoders)]
swapped_x = []
for i in range(len(x[0])):
swapped_x.append([x_s[i] for x_s in x])
features = []
for xx in swapped_x:
xx = self._encode_mm(xx)
features.append(xx)
x = torch.cat(features, dim=0)
x = torch.split(x, split_sizes, dim=0)
return [xx.contiguous().view(-1, xx.shape[2]) for xx in x]
def prepare_inputs_labels_for_multimodal(
self, input_ids, position_ids, attention_mask, past_key_values, labels, vision_input, data_types
):
vision_tower = self.get_vision_tower()
if vision_tower is None or vision_input is None or input_ids.shape[1] == 1:
if (
past_key_values is not None
and vision_tower is not None
and vision_input is not None
and input_ids.shape[1] == 1
):
target_shape = past_key_values[-1][-1].shape[-2] + 1
attention_mask = torch.cat(
(
attention_mask,
torch.ones(
(
attention_mask.shape[0],
target_shape - attention_mask.shape[1],
),
dtype=attention_mask.dtype,
device=attention_mask.device,
),
),
dim=1,
)
position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
return (
input_ids,
position_ids,
attention_mask,
past_key_values,
None,
labels,
)
'''
vision_input is a list of tuples, and data_type is a list of strings:
data_type = ['image', 'video', 'video'..., 'text']
(for one video and two image encoders)
vision_input =
[
[image(1, T, C, H, W), image(1, T, C, H, W), image(1, T, C, H, W)],
[video(Nc1, C, T, H, W), video(Nc1, T, C, H, W), video(Nc1, T, C, H, W)],
[video(Nc2, C, T, H, W), video(Nc2, T, C, H, W), video(Nc2, T, C, H, W)],
]
-> video encoders typlically expect (C,T,H,W), images expect (C,H,W).
'''
# ====================================================================================================
merged_mm_features = self.encode_mm_minibatch(vision_input)
if not getattr(self.config, "tune_language_model", True) and getattr(self.config, "use_mm_start_end", False):
raise NotImplementedError
# ====================================================================================================
# Let's just add dummy tensors if they do not exist,
# it is a headache to deal with None all the time.
# But it is not ideal, and if you have a better idea,
# please open an issue / submit a PR, thanks.
_labels = labels
_position_ids = position_ids
_attention_mask = attention_mask
if attention_mask is None:
attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
else:
attention_mask = attention_mask.bool()
if position_ids is None:
position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
if labels is None:
labels = torch.full_like(input_ids, IGNORE_INDEX)
# remove the padding using attention_mask
input_ids_copy = input_ids.clone()
# kentang-mit@: Otherwise tokenizer out of bounds. Embeddings of image tokens will not be used.
input_ids_copy[input_ids_copy == X_TOKEN_INDEX] = 0
input_embeds = self.get_llm().model.embed_tokens(input_ids_copy)
input_ids = [
cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)
]
input_embeds_1 = [
cur_input_embeds[cur_attention_mask]
for cur_input_embeds, cur_attention_mask in zip(input_embeds, attention_mask)
]
labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
# input_ids, new_input_embeds = self.inputs_merger(input_ids, input_embeds_1, merged_mm_features)
new_labels = []
new_input_embeds = []
# print("BEFORE BATCH LOOP:", len(input_ids), input_ids[0].shape, input_ids[0].device, [(x == X_TOKEN_INDEX).sum() for x in input_ids])
# kentang-mit@: If some part of the model is executed in the loop, the the loop length needs to be a constant.
for batch_idx, (cur_labels, cur_input_ids, mm_features) in enumerate(
zip(labels, input_ids, merged_mm_features)):
cur_input_ids = input_ids[batch_idx]
num_mm = (cur_input_ids == X_TOKEN_INDEX).sum()
if num_mm == 0:
cur_input_embeds_1 = input_embeds_1[batch_idx]
cur_input_embeds = torch.cat([cur_input_embeds_1, mm_features[0:0]], dim=0)
new_input_embeds.append(cur_input_embeds)
new_labels.append(cur_labels)
# kenang-mit@: we do not have placeholdr image for text-only data now.
continue
if mm_features.shape[0] != num_mm:
print(data_types[batch_idx])
assert num_mm == len(
mm_features), f'Error in {data_types[batch_idx]}{num_mm}=/={len(mm_features)} not the same number of vision tokens in and vision embeddings!'
cur_input_embeds = input_embeds_1[batch_idx]
image_token_indices = (
[-1] + torch.where(cur_input_ids == X_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
)
cur_input_ids_noim = []
cur_labels = labels[batch_idx]
cur_labels_noim = []
cur_input_embeds_no_im = []
for i in range(len(image_token_indices) - 1):
cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1: image_token_indices[i + 1]])
cur_labels_noim.append(cur_labels[image_token_indices[i] + 1: image_token_indices[i + 1]])
cur_input_embeds_no_im.append(cur_input_embeds[image_token_indices[i] + 1: image_token_indices[i + 1]])
cur_new_input_embeds = []
cur_new_labels = []
for i in range(num_mm + 1):
cur_new_input_embeds.append(cur_input_embeds_no_im[i])
# print("cur_new_input_embeds1", cur_new_input_embeds.shape[-1])
cur_new_labels.append(cur_labels_noim[i])
if i < num_mm:
cur_image_features = mm_features[i:i + 1]
cur_new_input_embeds.append(cur_image_features)
# print("cur_new_input_embeds2", cur_new_input_embeds.shape[-1])
cur_new_labels.append(
torch.full(
(cur_image_features.shape[0],),
IGNORE_INDEX,
device=cur_labels.device,
dtype=cur_labels.dtype,
)
)
cur_new_input_embeds = torch.cat(cur_new_input_embeds)
cur_new_labels = torch.cat(cur_new_labels)
new_input_embeds.append(cur_new_input_embeds)
new_labels.append(cur_new_labels)
# Truncate sequences to max length as image embeddings can make the sequence longer
tokenizer_model_max_length = getattr(self.get_llm().config, "tokenizer_model_max_length", None)
if tokenizer_model_max_length is not None:
if any(len(x) > tokenizer_model_max_length for x in new_input_embeds):
priny("Inputs truncated!")
new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
# Combine them
max_len = max(x.shape[0] for x in new_input_embeds)
batch_size = len(new_input_embeds)
new_input_embeds_padded = []
new_labels_padded = torch.full(
(batch_size, max_len),
IGNORE_INDEX,
dtype=new_labels[0].dtype,
device=new_labels[0].device,
)
attention_mask = torch.zeros(
(batch_size, max_len),
dtype=attention_mask.dtype,
device=attention_mask.device,
)
position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
cur_len = cur_new_embed.shape[0]
if getattr(self.get_llm().config, "tokenizer_padding_side", "right") == "left":
new_input_embeds_padded.append(
torch.cat(
(
torch.zeros(
(max_len - cur_len, cur_new_embed.shape[1]),
dtype=cur_new_embed.dtype,
device=cur_new_embed.device,
),
cur_new_embed,
),
dim=0,
)
)
if cur_len > 0:
new_labels_padded[i, -cur_len:] = cur_new_labels
attention_mask[i, -cur_len:] = True
position_ids[i, -cur_len:] = torch.arange(
0, cur_len, dtype=position_ids.dtype, device=position_ids.device
)
else:
new_input_embeds_padded.append(
torch.cat(
(
cur_new_embed,
torch.zeros(
(max_len - cur_len, cur_new_embed.shape[1]),
dtype=cur_new_embed.dtype,
device=cur_new_embed.device,
),
),
dim=0,
)
)
if cur_len > 0:
new_labels_padded[i, :cur_len] = cur_new_labels
attention_mask[i, :cur_len] = True
position_ids[i, :cur_len] = torch.arange(
0, cur_len, dtype=position_ids.dtype, device=position_ids.device
)
new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
if _labels is None:
new_labels = None
else:
new_labels = new_labels_padded
if _attention_mask is None:
attention_mask = None
else:
attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
if _position_ids is None:
position_ids = None
return (
None,
position_ids,
attention_mask,
past_key_values,
new_input_embeds,
new_labels,
)