import logging
import random

import torch
from torch.cuda.amp import autocast
from torchvision import models
import torch.nn as nn

from medomni.common.registry import registry
from medomni.models.blip2 import Blip2Base, disabled_train
from medomni.models.modeling_llama import LlamaForCausalLM
from transformers import LlamaTokenizer
from transformers import SwinModel
import torch.nn.functional as F
import math
from einops import rearrange, repeat
from einops_exts import rearrange_many
import open_clip
import segmentation_models_pytorch as smp
from medomni.models.UNet import UNet3d
from huggingface_hub import PyTorchModelHubMixin
from peft import (
    get_peft_model,
    LoraConfig,
    PrefixTuningConfig,
    PromptEncoderConfig,
    PromptTuningConfig,
    TaskType,
)

class GroupNorm(nn.GroupNorm):
    """Subclass torch's GroupNorm to handle fp16: normalize in fp32, cast back."""

    def forward(self, x: torch.Tensor):
        orig_type = x.dtype
        ret = super().forward(x.type(torch.float32))
        return ret.type(orig_type)


class LayerNorm(nn.LayerNorm):
    """Subclass torch's LayerNorm to handle fp16: normalize in fp32, cast back."""

    def forward(self, x: torch.Tensor):
        orig_type = x.dtype
        ret = super().forward(x.type(torch.float32))
        return ret.type(orig_type)


def replace_batchnorm_2d(model):
    """Recursively replace every nn.BatchNorm2d in `model` with a 16-group
    GroupNorm, which is stable under mixed precision and small batches."""
    for name, module in reversed(model._modules.items()):
        if len(list(module.children())) > 0:
            model._modules[name] = replace_batchnorm_2d(module)
        if isinstance(module, nn.BatchNorm2d):
            model._modules[name] = GroupNorm(num_groups=16, num_channels=module.num_features)
    return model
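
# Illustrative usage (mirrors the call in MedOmni.__init__ below):
#   seg = smp.Unet(encoder_name="resnet18", encoder_weights="imagenet", in_channels=3, classes=1)
#   seg = replace_batchnorm_2d(seg)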

def dice_loss(input, target):
    # NOTE: despite the name, this returns the soft Dice *coefficient*
    # (higher is better); MixedLoss below turns it into a loss via -log().
    input = torch.sigmoid(input)
    smooth = 1.0
    iflat = input.view(-1)
    tflat = target.view(-1)
    intersection = (iflat * tflat).sum()
    return (2.0 * intersection + smooth) / (iflat.sum() + tflat.sum() + smooth)


class FocalLoss(nn.Module):
    def __init__(self, gamma):
        super().__init__()
        self.gamma = gamma

    def forward(self, input, target):
        if not (target.size() == input.size()):
            raise ValueError("Target size ({}) must be the same as input size ({})"
                             .format(target.size(), input.size()))
        # Numerically stable binary cross-entropy with logits.
        max_val = (-input).clamp(min=0)
        loss = input - input * target + max_val + \
            ((-max_val).exp() + (-input - max_val).exp()).log()
        # Focal modulation: down-weight easy examples by (1 - p_t)^gamma.
        invprobs = F.logsigmoid(-input * (target * 2.0 - 1.0))
        loss = (invprobs * self.gamma).exp() * loss
        return loss.mean()


class MixedLoss(nn.Module):
    def __init__(self, alpha, gamma):
        super().__init__()
        self.alpha = alpha
        self.focal = FocalLoss(gamma)

    def forward(self, input, target):
        loss = self.alpha * self.focal(input, target) - torch.log(dice_loss(input, target))
        return loss.mean()
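
# Illustrative usage: `seg_preds` are raw mask logits and `masks` the binary
# targets of the same shape, as in MedOmni.forward's segmentation branches:
#   criterion = MixedLoss(alpha=10.0, gamma=2.0)
#   loss = criterion(seg_preds, masks.float())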

def _parse_coord_groups(sample_num, bsz, dim, sep):
    """Parse answer strings of `sep`-separated coordinate groups (each group a
    comma-separated list of numbers, or 'n/a') into a (bsz, dim) float tensor.
    One row is filled per non-'n/a' group, in order."""
    labels = torch.zeros((bsz, dim))
    row = 0
    for sample in sample_num:
        for group in sample.split(sep):
            if group != 'n/a':
                for col, num in enumerate(group.split(',')):
                    labels[row, col] = float(num)
                row += 1
    return labels


def trans_seg(sample_num, bsz):
    # Up to 10 values per '-'-separated group.
    return _parse_coord_groups(sample_num, bsz, dim=10, sep='-')


def trans_det(sample_num, bsz):
    # 4 values per group: one bounding box.
    return _parse_coord_groups(sample_num, bsz, dim=4, sep=';')


def trans_keypoint(sample_num, bsz):
    # 2 values per group: one 2-D keypoint.
    return _parse_coord_groups(sample_num, bsz, dim=2, sep=';')
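
# Example (illustrative): one box plus an 'n/a' group parses to a single
# filled row:
#   trans_det(["12,34,56,78;n/a"], bsz=1)
#   # -> tensor([[12., 34., 56., 78.]])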

@registry.register_model("medomni")
class MedOmni(Blip2Base, PyTorchModelHubMixin):
    PRETRAINED_MODEL_CONFIG_DICT = {
        "medomni": "configs/models/medomni.yaml",
    }

    def __init__(
        self,
        config,
    ):
        super().__init__()
        freeze_vit = True
        llama_model = config['llama_model']
        max_txt_len = config['max_txt_len']
        low_resource = False
        end_sym = config['end_sym']

        self.low_resource = low_resource

        print('Loading VIT')
        # 2-D encoder: Swin-B (feature dim 1024); 3-D encoder: UNet3d (bottleneck dim 256).
        self.visual_encoder_2d = SwinModel.from_pretrained('microsoft/swin-base-patch4-window7-224')
        self.visual_encoder_3d = UNet3d(in_channels=1, n_classes=1, n_channels=32)
        self.ln_vision_2d = LayerNorm(1024)
        self.ln_vision_3d = LayerNorm(256)

        if freeze_vit:
            # Freeze both vision stacks and pin them in eval mode.
            for module in (self.visual_encoder_2d, self.ln_vision_2d,
                           self.visual_encoder_3d, self.ln_vision_3d):
                for param in module.parameters():
                    param.requires_grad = False
                module.eval()
                module.train = disabled_train
            logging.info("freeze vision encoder")
        print('Loading VIT Done')

        print('Loading LLAMA')
        self.llama_tokenizer = LlamaTokenizer.from_pretrained(llama_model, legacy=False, use_fast=False)
        special_token = {}
        special_token["additional_special_tokens"] = ['<ImageHere>']
        self.llama_tokenizer.add_special_tokens(special_token)
        # Task tokens whose hidden states feed the detection/segmentation heads.
        self.llama_tokenizer.add_tokens("<DET>")
        self.llama_tokenizer.add_tokens("<2DSEG>")
        self.llama_tokenizer.add_tokens("<3DSEG>")
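        # Assumed: a dedicated keypoint token. The keypoint branch in forward()
        # reads self.point_token_idx_2d, but no such token is registered
        # anywhere in this file; "<2DPOINT>" is a placeholder name.
        self.llama_tokenizer.add_tokens("<2DPOINT>")
        self.point_token_idx_2d = self.llama_tokenizer("<2DPOINT>", add_special_tokens=False).input_ids[0]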
        self.llama_tokenizer.add_tokens("<N/A>")
        self.det_token_idx = self.llama_tokenizer("<DET>", add_special_tokens=False).input_ids[0]
        self.seg_token_idx_2d = self.llama_tokenizer("<2DSEG>", add_special_tokens=False).input_ids[0]
        self.seg_token_idx_3d = self.llama_tokenizer("<3DSEG>", add_special_tokens=False).input_ids[0]
        self.na_token_idx = self.llama_tokenizer("<N/A>", add_special_tokens=False).input_ids[0]
        # Pad with token id 0 (LLaMA's <unk>).
        self.llama_tokenizer.pad_token_id = 0

        if self.low_resource:
            self.llama_model = LlamaForCausalLM.from_pretrained(
                llama_model,
                torch_dtype=torch.bfloat16,
                load_in_8bit=True,
                device_map="auto"
            )
        else:
            self.llama_model = LlamaForCausalLM.from_pretrained(
                llama_model,
                torch_dtype=torch.bfloat16,
            )

        self.llama_model.resize_token_embeddings(len(self.llama_tokenizer))
        self.embed_tokens = self.llama_model.get_input_embeddings()
        self.embed_states = self.llama_model.get_output_embeddings()

        class CastOutputToFloat(nn.Sequential):
            # Despite the name, this casts lm_head outputs to bfloat16.
            def forward(self, x):
                return super().forward(x).to(torch.bfloat16)
        self.llama_model.lm_head = CastOutputToFloat(self.llama_model.lm_head)

        print("Setup PEFT")
        # LoRA on the attention q/v projections only; the base LLaMA weights stay frozen.
        peft_config = LoraConfig(
            task_type="CAUSAL_LM", inference_mode=False,
            r=16,
            lora_alpha=16, lora_dropout=0.1,
            target_modules=['q_proj', 'v_proj']
        )
        self.llama_model = get_peft_model(self.llama_model, peft_config)
        # Project 2-D (Swin, 1024-d) and 3-D (UNet3d, 256-d) visual features
        # into the LLaMA embedding space.
        self.llama_proj_2d = nn.Linear(1024, self.llama_model.config.hidden_size)
        self.llama_proj_3d = nn.Linear(256, self.llama_model.config.hidden_size)

        # Detection head: regress one box (4 values) from each <DET> hidden state.
        text_det = nn.Sequential(
            LayerNorm(self.llama_model.config.hidden_size),
            nn.Linear(self.llama_model.config.hidden_size, 256),
            nn.ReLU(inplace=True),
            LayerNorm(256),
            nn.Linear(256, 4),
        )
        self.text_det = text_det
        self.det_loss = torch.nn.SmoothL1Loss()
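
        # Keypoint head (assumed): forward() uses self.text_point and
        # self.keypoint_loss, which are otherwise undefined in this file.
        # A minimal sketch mirroring the detection head above, regressing one
        # 2-D coordinate pair per keypoint token.
        self.text_point = nn.Sequential(
            LayerNorm(self.llama_model.config.hidden_size),
            nn.Linear(self.llama_model.config.hidden_size, 256),
            nn.ReLU(inplace=True),
            LayerNorm(256),
            nn.Linear(256, 2),
        )
        self.keypoint_loss = torch.nn.SmoothL1Loss()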

        # 2-D segmentation model; BatchNorm is swapped for GroupNorm so it
        # behaves under autocast and small batches.
        self.model_seg_2d = smp.Unet(encoder_name="resnet18", encoder_weights="imagenet", in_channels=3, classes=1)
        self.model_seg_2d = replace_batchnorm_2d(self.model_seg_2d)

        # Project <2DSEG>/<3DSEG> hidden states into the segmentation
        # bottleneck channel dims (512 for 2-D, 256 for 3-D).
        text2seg_2d = nn.Sequential(
            LayerNorm(self.llama_model.config.hidden_size),
            nn.Linear(self.llama_model.config.hidden_size, 512),
        )
        self.text2seg_2d = text2seg_2d
        self.text2seg_2d_ln = LayerNorm(512)
        self.text2seg_2d_gn = GroupNorm(16, 512)
        text2seg_3d = nn.Sequential(
            LayerNorm(self.llama_model.config.hidden_size),
            nn.Linear(self.llama_model.config.hidden_size, 256),
        )
        self.text2seg_3d = text2seg_3d
        self.text2seg_3d_ln = LayerNorm(256)
        self.text2seg_3d_gn = GroupNorm(16, 256)
        self.seg_loss = MixedLoss(10.0, 2.0)

        self.max_txt_len = max_txt_len
        self.end_sym = end_sym
        self.prompt_list = []

    def vit_to_cpu(self):
        # Low-resource path: keep the frozen vision stacks on CPU in fp32.
        for module in (self.ln_vision_2d, self.visual_encoder_2d,
                       self.ln_vision_3d, self.visual_encoder_3d):
            module.to("cpu")
            module.float()

    def encode_img(self, image, modals, task_types=[]):
        B, S, _, _, _ = image.shape
        device = image.device
        image_embeds_list = None
        if self.low_resource:
            self.vit_to_cpu()
            image = image.to("cpu")

        with self.maybe_autocast():
            if 'ct' in modals:
                # 3-D path: take the UNet3d encoder pyramid, pool the deepest
                # map to 1x3x3, and project into the LLaMA embedding space.
                image_embeds_list = self.visual_encoder_3d(image, encoder_only=True)
                image_embeds_list = [feat.to(device) for feat in image_embeds_list]
                image_embeds = image_embeds_list[-1].detach()
                image_embeds = F.adaptive_avg_pool3d(image_embeds, (1, 3, 3)).view(B, image_embeds.shape[1], -1).permute(0, 2, 1)
                inputs_llama = self.llama_proj_3d(self.ln_vision_3d(image_embeds))
                inputs_llama = rearrange(inputs_llama, "(b s) c d -> b s c d", b=B, s=S).to(torch.bfloat16)
                atts_llama = torch.ones(inputs_llama.size()[:-2], dtype=torch.long).to(image.device)
            else:
                # 2-D path: fold the series dim into the batch, pool Swin's
                # 7x7 grid to 3x3 (9 visual tokens per image), then project.
                image = rearrange(image, "b s c h w -> (b s) c h w")
                image_embeds = self.visual_encoder_2d(image)['last_hidden_state'].to(device)
                image_embeds_unp = image_embeds.permute(0, 2, 1).view(B * S, -1, 7, 7)
                image_embeds_unp = F.adaptive_avg_pool2d(image_embeds_unp, (3, 3))
                image_embeds = image_embeds_unp.view(B * S, -1, 9).permute(0, 2, 1)
                inputs_llama = self.llama_proj_2d(self.ln_vision_2d(image_embeds))
                if 'segmentation' not in task_types:
                    inputs_llama = rearrange(inputs_llama, "(b s) c d -> b s c d", b=B, s=S).to(torch.bfloat16)
                    atts_llama = torch.ones(inputs_llama.size()[:-2], dtype=torch.long).to(image.device)
                else:
                    # For segmentation, detach so the visual prompt does not
                    # receive gradients from the LLM loss.
                    inputs_llama = rearrange(inputs_llama, "(b s) c d -> b s c d", b=B, s=S).to(torch.bfloat16).detach()
                    atts_llama = torch.ones(inputs_llama.size()[:-2], dtype=torch.long).to(image.device).detach()

        return inputs_llama, atts_llama, image_embeds_list

    def prompt_concat(self, img_embeds, atts_img, prompt):
        # Append the tokenized question embeddings after the image embeddings.
        if prompt:
            batch_size = img_embeds.shape[0]
            p_after_embeds = self.embed_tokens(prompt.input_ids).expand(batch_size, -1, -1)
            wrapped_img_embeds = torch.cat([img_embeds, p_after_embeds], dim=1)
            wrapped_atts_img = atts_img[:, :1].expand(-1, wrapped_img_embeds.shape[1])
            return wrapped_img_embeds, wrapped_atts_img
        else:
            return img_embeds, atts_img

    def prompt_wrap(self, img_embeds, atts_img, prompt_list, num_imgs, seg=None):
        bsz = img_embeds.shape[0]
        if prompt_list:
            # Flat (sample, image) index pairs for gathering per-image embeddings.
            img_idx = ([], [])
            for i in range(len(num_imgs)):
                for j in range(num_imgs[i]):
                    img_idx[0].append(i)
                    img_idx[1].append(j)
            prompt_tokens = self.llama_tokenizer(prompt_list, return_tensors="pt", padding="longest", truncation=True, max_length=256).to(img_embeds.device)
            # 32000 is the id of '<ImageHere>' (the first token added on top of
            # LLaMA's 32,000-token base vocabulary). Swap it for a throwaway
            # id (123); the embeddings at those positions are then overwritten
            # with the visual embeddings below.
            idx = (prompt_tokens.input_ids == 32000).nonzero(as_tuple=True)
            prompt_tokens.input_ids[idx] = 123
            p_embeds = self.embed_tokens(prompt_tokens.input_ids).expand(bsz, -1, -1)
            if seg is None:
                p_embeds[idx] = rearrange(img_embeds[img_idx], "b c d -> (b c) d").to(torch.bfloat16)
            else:
                p_embeds[idx] = rearrange(img_embeds[img_idx], "b c d -> (b c) d").to(torch.bfloat16).detach()
            return p_embeds, atts_img
        else:
            return img_embeds, atts_img

    def forward(self, samples):
        image = samples["image"]
        bsz = image.shape[0]
        img_embeds, atts_img, img_embeds_list = self.encode_img(image, samples['modal'], samples['task_type'])
        # Build a '<imgK>' + nine '<ImageHere>' slots + '</imgK>' prefix per image.
        prefix_list = []
        tag_list = [[] for _ in range(bsz)]
        placeholder = ['<ImageHere>'] * 9
        for j in range(bsz):
            num = samples['num_imgs'][j]
            prefix = ''
            for i in range(num):
                prefix += '<img' + str(i) + '>' + ''.join(placeholder) + '</img' + str(i) + '>'
                tag_list[j].append('<img' + str(i) + '>')
            prefix_list.append('###Human:' + prefix)
        img_embeds, atts_img = self.prompt_wrap(img_embeds, atts_img, prefix_list, samples['num_imgs'], seg=None if 'segmentation' not in samples['task_type'] else 'yes')
        self.llama_tokenizer.padding_side = "right"

        # Substitute the '_*_' placeholder in each question with any image
        # tags not already mentioned in it.
        prompt = list(samples['question'])
        for i in range(len(prompt)):
            tags = ''
            for tag in tag_list[i]:
                if tag not in prompt[i]:
                    tags += tag
            prompt[i] = prompt[i].replace('_*_', tags)

        # Detection/keypoint answers carry coordinates after a '|||' separator;
        # split them off so only the text part is used as the LM target.
        if 'detection' in samples['task_type'] or 'keypoint' in samples['task_type']:
            sample_ans = [ans.split('|||')[0] for ans in samples['answer']]
            sample_num = [ans.split('|||')[1] for ans in samples['answer']]
        else:
            sample_ans = samples['answer']
        text = ['###Assistant: ' + str(t) + self.end_sym for t in sample_ans]

        prompt_tokens = self.llama_tokenizer(
            prompt,
            return_tensors="pt",
            padding='longest',
            truncation=True,
            max_length=256,
            add_special_tokens=False
        ).to(image.device)

        img_embeds, atts_img = self.prompt_concat(img_embeds, atts_img, prompt_tokens)

        to_regress_tokens = self.llama_tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            truncation=True,
            max_length=self.max_txt_len,
            add_special_tokens=False
        ).to(image.device)

        # Ignore pad positions in the LM loss.
        targets = to_regress_tokens.input_ids.masked_fill(
            to_regress_tokens.input_ids == self.llama_tokenizer.pad_token_id, -100
        )

        # Mask out the prefix (bos + image/prompt embeddings) from the loss.
        empty_targets = (
            torch.ones([atts_img.shape[0], atts_img.shape[1] + 1],
                       dtype=torch.long).to(image.device).fill_(-100)
        )
        targets = torch.cat([empty_targets, targets], dim=1)

        batch_size = img_embeds.shape[0]
        bos = torch.ones([batch_size, 1],
                         dtype=to_regress_tokens.input_ids.dtype,
                         device=to_regress_tokens.input_ids.device) * self.llama_tokenizer.bos_token_id
        bos_embeds = self.embed_tokens(bos)
        atts_bos = atts_img[:, :1]

        to_regress_embeds = self.embed_tokens(to_regress_tokens.input_ids)
        inputs_embeds = torch.cat([bos_embeds, img_embeds, to_regress_embeds], dim=1)
        attention_mask = torch.cat([atts_bos, atts_img, to_regress_tokens.attention_mask], dim=1)
        with self.maybe_autocast():
            outputs = self.llama_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                return_dict=True,
                labels=targets,
                output_hidden_states=True,
            )
        loss = outputs.loss

        if 'detection' in samples['task_type']:
            # Regress boxes from the hidden states at <DET> token positions.
            with self.maybe_autocast():
                hidden_states = outputs.hidden_states[-1]
                token_mask = targets == self.det_token_idx
                target_states = hidden_states[token_mask]
                det_states = self.text_det(target_states)
            labels = trans_det(sample_num, det_states.shape[0])
            labels = labels.to(targets.device)
            det_loss = self.det_loss(det_states, labels)
            loss += det_loss * 1e2

        if 'keypoint' in samples['task_type']:
            # Regress 2-D coordinates from the hidden states at keypoint token
            # positions (see the assumed head in __init__).
            with self.maybe_autocast():
                hidden_states = outputs.hidden_states[-1]
                token_mask = targets == self.point_token_idx_2d
                target_states = hidden_states[token_mask]
                point_states = self.text_point(target_states)
            labels = trans_keypoint(sample_num, point_states.shape[0])
            labels = labels.to(targets.device)
            keypoint_loss = self.keypoint_loss(point_states, labels)
            loss += keypoint_loss * 1e2

        if 'segmentation' in samples['task_type']:
            if 'ct' in samples['modal']:
                # 3-D: condition the UNet3d bottleneck on the <3DSEG> hidden
                # state, then decode a volumetric mask.
                masks = samples['answer_img']
                with self.maybe_autocast():
                    img_embeds_list = self.visual_encoder_3d(image, encoder_only=True)
                    img_embeds_list = [feat.to(targets.device) for feat in img_embeds_list]
                    hidden_states = outputs.hidden_states[-1]
                    token_mask = targets == self.seg_token_idx_3d
                    target_states = hidden_states[token_mask]
                    seg_states = self.text2seg_3d(target_states)
                    last_feats = img_embeds_list[-1]
                    last_feats = last_feats + seg_states.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
                    last_feats = self.text2seg_3d_gn(last_feats)
                    img_embeds_list[-1] = last_feats
                    seg_preds = self.visual_encoder_3d(encoder_only=False, x_=img_embeds_list)
                    loss += self.seg_loss(seg_preds, masks.float())
            else:
                # 2-D: condition the smp U-Net bottleneck on the <2DSEG>
                # hidden state, then decode a mask.
                masks = samples['answer_img']
                with self.maybe_autocast():
                    feats = self.model_seg_2d.encoder(image[:, 0])
                    last_feats = feats[-1]
                    hidden_states = outputs.hidden_states[-1]
                    token_mask = targets == self.seg_token_idx_2d
                    target_states = hidden_states[token_mask]
                    seg_states = self.text2seg_2d(target_states)
                    last_feats = last_feats + seg_states.unsqueeze(-1).unsqueeze(-1)
                    last_feats = self.text2seg_2d_gn(last_feats)
                    feats[-1] = last_feats
                    seg_feats = self.model_seg_2d.decoder(*feats)
                    seg_preds = self.model_seg_2d.segmentation_head(seg_feats)
                    loss += self.seg_loss(seg_preds, masks.float())

        return {"loss": loss, "modal": samples['modal'][0], "task_type": samples['task_type'][0]}

    @classmethod
    def from_config(cls, cfg, finetune=False):
        model = cls(cfg)

        ckpt_path = cfg.get("ckpt", "")
        if ckpt_path:
            print("Load Checkpoint: {}".format(ckpt_path))
            ckpt = torch.load(ckpt_path, map_location="cpu")
            if finetune:
                # Keep only checkpoint weights whose shapes match the current
                # model; fall back to freshly initialized weights otherwise.
                current_model_dict = model.state_dict()
                weights = ckpt['model']
                new_state_dict = {}
                for k in list(current_model_dict.keys()):
                    if k in list(weights.keys()):
                        if weights[k].size() == current_model_dict[k].size():
                            new_state_dict[k] = weights[k]
                        else:
                            new_state_dict[k] = current_model_dict[k]
                    else:
                        print("Missing in checkpoint, keeping init: {}".format(k))
                        new_state_dict[k] = current_model_dict[k]
                msg = model.load_state_dict(new_state_dict, strict=False)
            else:
                msg = model.load_state_dict(ckpt['model'], strict=False)

        return model
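

# Illustrative construction (assumed config values; the keys mirror what
# __init__ and from_config read above, and the llama path is a placeholder):
#   cfg = {
#       "llama_model": "/path/to/llama-7b",   # hypothetical local path
#       "max_txt_len": 256,
#       "end_sym": "###",
#       "ckpt": "",                           # optional checkpoint path
#   }
#   model = MedOmni.from_config(cfg)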