File size: 22,551 Bytes

cca9b7e

import logging
import random

import torch
from torch.cuda.amp import autocast as autocast
from torchvision import models
import torch.nn as nn

from medomni.common.registry import registry
from medomni.models.blip2 import Blip2Base, disabled_train
from medomni.models.modeling_llama import LlamaForCausalLM
from transformers import LlamaTokenizer
from transformers import SwinModel
import torch.nn.functional as F
import math
from einops import rearrange, repeat
from einops_exts import rearrange_many
import open_clip
import segmentation_models_pytorch as smp
from medomni.models.UNet import UNet3d
from huggingface_hub import PyTorchModelHubMixin
import ipdb
from peft import (
    get_peft_model,
    LoraConfig,
    PrefixTuningConfig,
    PromptEncoderConfig,
    PromptTuningConfig,
    TaskType,
)

class GroupNorm(nn.GroupNorm):
    """Subclass torch's LayerNorm to handle fp16."""

    def forward(self, x: torch.Tensor):
        orig_type = x.dtype
        ret = super().forward(x.type(torch.float32))
        return ret.type(orig_type)

class LayerNorm(nn.LayerNorm):
    """Subclass torch's LayerNorm to handle fp16."""

    def forward(self, x: torch.Tensor):
        orig_type = x.dtype
        ret = super().forward(x.type(torch.float32))
        return ret.type(orig_type)

def replace_batchnorm_2d(model):
    for name, module in reversed(model._modules.items()):
        if len(list(module.children())) > 0:
            model._modules[name] = replace_batchnorm_2d(module)
        
        if isinstance(module, nn.BatchNorm2d):
            model._modules[name] = GroupNorm(num_groups=16, num_channels=module.num_features)
    return model

def dice_loss(input, target):
    input = torch.sigmoid(input)
    smooth = 1.0
    iflat = input.view(-1)
    tflat = target.view(-1)
    intersection = (iflat * tflat).sum()
    return ((2.0 * intersection + smooth) / (iflat.sum() + tflat.sum() + smooth))

class FocalLoss(nn.Module):
    def __init__(self, gamma):
        super().__init__()
        self.gamma = gamma

    def forward(self, input, target):
        if not (target.size() == input.size()):
            raise ValueError("Target size ({}) must be the same as input size ({})"
                             .format(target.size(), input.size()))
        max_val = (-input).clamp(min=0)
        loss = input - input * target + max_val + \
            ((-max_val).exp() + (-input - max_val).exp()).log()
        invprobs = F.logsigmoid(-input * (target * 2.0 - 1.0))
        loss = (invprobs * self.gamma).exp() * loss
        return loss.mean()

class MixedLoss(nn.Module):
    def __init__(self, alpha, gamma):
        super().__init__()
        self.alpha = alpha
        self.focal = FocalLoss(gamma)

    def forward(self, input, target):
        loss = self.alpha*self.focal(input, target) - torch.log(dice_loss(input, target))
        return loss.mean()

def trans_seg(sample_num, bsz):
    labels = torch.zeros((bsz, 10))
    c_bsz = 0
    for num1 in sample_num:
        num2 = num1.split('-')
        for num3 in num2:
            if num3 != 'n/a':
                c4 = 0
                for num in num3.split(','):
                    labels[c_bsz, c4] = float(num)
                    c4 += 1
                c_bsz += 1
    return labels

def trans_det(sample_num, bsz):
    labels = torch.zeros((bsz, 4))
    c_bsz = 0
    for num1 in sample_num:
        num2 = num1.split(';')
        for num3 in num2:
            if num3 != 'n/a':
                c4 = 0
                for num in num3.split(','):
                    labels[c_bsz, c4] = float(num)
                    c4 += 1
                c_bsz += 1
    return labels

def trans_keypoint(sample_num, bsz):
    labels = torch.zeros((bsz, 2))
    c_bsz = 0
    for num1 in sample_num:
        num2 = num1.split(';')
        for num3 in num2:
            if num3 != 'n/a':
                c4 = 0
                for num in num3.split(','):
                    labels[c_bsz, c4] = float(num)
                    c4 += 1
                c_bsz += 1
    return labels

@registry.register_model("medomni")
class MedOmni(Blip2Base, PyTorchModelHubMixin):
    PRETRAINED_MODEL_CONFIG_DICT = {
        "medomni": "configs/models/medomni.yaml",
    }
    def __init__(
        self,
        config,
    ):
        super().__init__()
        freeze_vit=True
        llama_model=config['llama_model']
        max_txt_len=config['max_txt_len']
        low_resource=False  # use 8 bit and put vit in cpu / have not been tested
        end_sym=config['end_sym']
        # self.tokenizer = self.init_tokenizer()
        self.low_resource = low_resource

        print('Loading VIT')
        self.visual_encoder_2d = SwinModel.from_pretrained('microsoft/swin-base-patch4-window7-224')
        self.visual_encoder_3d = UNet3d(in_channels=1, n_classes=1, n_channels=32)
        self.ln_vision_2d = LayerNorm(1024)
        self.ln_vision_3d = LayerNorm(256)

        if freeze_vit:
            for name, param in self.visual_encoder_2d.named_parameters():
                param.requires_grad = False
            self.visual_encoder_2d = self.visual_encoder_2d.eval()
            self.visual_encoder_2d.train = disabled_train
            for name, param in self.ln_vision_2d.named_parameters():
                param.requires_grad = False
            self.ln_vision_2d = self.ln_vision_2d.eval()
            self.ln_vision_2d.train = disabled_train
            for name, param in self.visual_encoder_3d.named_parameters():
                param.requires_grad = False
            self.visual_encoder_3d = self.visual_encoder_3d.eval()
            self.visual_encoder_3d.train = disabled_train
            for name, param in self.ln_vision_3d.named_parameters():
                param.requires_grad = False
            self.ln_vision_3d = self.ln_vision_3d.eval()
            self.ln_vision_3d.train = disabled_train
            logging.info("freeze vision encoder")
        print('Loading VIT Done')

        print('Loading LLAMA')
        self.llama_tokenizer = LlamaTokenizer.from_pretrained(llama_model, legacy=False, use_fast=False)
        special_token = {}
        special_token["additional_special_tokens"] = ['<ImageHere>']
        self.llama_tokenizer.add_special_tokens(
            special_token
        )
        self.llama_tokenizer.add_tokens("<DET>")
        self.llama_tokenizer.add_tokens("<2DSEG>")
        self.llama_tokenizer.add_tokens("<3DSEG>")
        # self.llama_tokenizer.add_tokens("<2DPOINT>")
        self.llama_tokenizer.add_tokens("<N/A>")
        self.det_token_idx = self.llama_tokenizer("<DET>", add_special_tokens=False).input_ids[0]
        self.seg_token_idx_2d = self.llama_tokenizer("<2DSEG>", add_special_tokens=False).input_ids[0]
        self.seg_token_idx_3d = self.llama_tokenizer("<3DSEG>", add_special_tokens=False).input_ids[0]
        # self.point_token_idx_2d = self.llama_tokenizer("<2DPOINT>", add_special_tokens=False).input_ids[0]
        self.na_token_idx = self.llama_tokenizer("<N/A>", add_special_tokens=False).input_ids[0]
        self.llama_tokenizer.pad_token = 0

        if self.low_resource:
            self.llama_model = LlamaForCausalLM.from_pretrained(
                llama_model,
                torch_dtype=torch.bfloat16,
                load_in_8bit=True,
                device_map="auto"
            )
        else:
            self.llama_model = LlamaForCausalLM.from_pretrained(
                llama_model,
                torch_dtype=torch.bfloat16,
            )

        self.llama_model.resize_token_embeddings(len(self.llama_tokenizer))
        self.embed_tokens = self.llama_model.get_input_embeddings()
        self.embed_states = self.llama_model.get_output_embeddings() # cannot remove
        # ---LoRA---
        class CastOutputToFloat(nn.Sequential):
            def forward(self, x): return super().forward(x).to(torch.bfloat16)
        self.llama_model.lm_head = CastOutputToFloat(self.llama_model.lm_head)
        # ---LoRA---

        print("Setup PEFT")
        peft_config = LoraConfig(
            task_type="CAUSAL_LM", inference_mode=False,
            r=16,
            lora_alpha=16, lora_dropout=0.1, 
            target_modules=['q_proj', 'v_proj']
        ) # 8 32 hyz 9.21
        self.llama_model = get_peft_model(self.llama_model, peft_config)
        self.llama_proj_2d = nn.Linear(1024, self.llama_model.config.hidden_size)
        self.llama_proj_3d = nn.Linear(256, self.llama_model.config.hidden_size)

        # # Detection
        text_det = nn.Sequential(
            LayerNorm(self.llama_model.config.hidden_size),
            nn.Linear(self.llama_model.config.hidden_size, 256),
            nn.ReLU(inplace=True),
            LayerNorm(256),
            nn.Linear(256, 4),
        )
        self.text_det = text_det
        self.det_loss = torch.nn.SmoothL1Loss()

        # # Keypoint
        # text_point = nn.Sequential(
        #     LayerNorm(self.llama_model.config.hidden_size),
        #     nn.Linear(self.llama_model.config.hidden_size, 256),
        #     nn.ReLU(inplace=True),
        #     LayerNorm(256),
        #     nn.Linear(256, 2),
        # )
        # self.text_point = text_point
        # self.keypoint_loss = torch.nn.SmoothL1Loss()

        # Segmentation
        self.model_seg_2d = smp.Unet(encoder_name="resnet18", encoder_weights="imagenet", in_channels=3, classes=1)
        self.model_seg_2d = replace_batchnorm_2d(self.model_seg_2d) # GN is much better than BN

        text2seg_2d = nn.Sequential(
            LayerNorm(self.llama_model.config.hidden_size),
            nn.Linear(self.llama_model.config.hidden_size, 512),
        )
        self.text2seg_2d = text2seg_2d
        self.text2seg_2d_ln = LayerNorm(512)
        self.text2seg_2d_gn = GroupNorm(16, 512)
        text2seg_3d = nn.Sequential(
            LayerNorm(self.llama_model.config.hidden_size),
            nn.Linear(self.llama_model.config.hidden_size, 256),
        )
        self.text2seg_3d = text2seg_3d
        self.text2seg_3d_ln = LayerNorm(256)
        self.text2seg_3d_gn = GroupNorm(16, 256)
        self.seg_loss = MixedLoss(10.0, 2.0)

        self.max_txt_len = max_txt_len
        self.end_sym = end_sym
        self.prompt_list = []

    def vit_to_cpu(self):
        self.ln_vision.to("cpu")
        self.ln_vision.float()
        self.visual_encoder.to("cpu")
        self.visual_encoder.float()

    def encode_img(self, image, modals, task_types=[]):
        B,S,_,_,_ = image.shape
        device = image.device
        image_embeds_list = None
        if self.low_resource:
            self.vit_to_cpu()
            image = image.to("cpu")

        with self.maybe_autocast():
            if 'ct' in modals:
                image_embeds_list = self.visual_encoder_3d(image, encoder_only=True)
                image_embeds_list = [_.to(device) for _ in image_embeds_list]
                image_embeds = image_embeds_list[-1].detach()
                image_embeds = F.adaptive_avg_pool3d(image_embeds, (1, 3, 3)).view(B, image_embeds.shape[1], -1).permute(0, 2, 1)
                inputs_llama = self.llama_proj_3d(self.ln_vision_3d(image_embeds))
                inputs_llama = rearrange(inputs_llama, "(b s) c d -> b s c d", b=B, s=S).to(torch.bfloat16)
                atts_llama = torch.ones(inputs_llama.size()[:-2], dtype=torch.long).to(image.device)
            else:
                image = rearrange(image, "b s c h w -> (b s) c h w")
                image_embeds = self.visual_encoder_2d(image)['last_hidden_state'].to(device)
                image_embeds_unp = image_embeds.permute(0, 2, 1).view(B*S,-1,7,7)
                image_embeds_unp = F.adaptive_avg_pool2d(image_embeds_unp, (3, 3))
                image_embeds = image_embeds_unp.view(B*S, -1, 9).permute(0, 2, 1)
                inputs_llama = self.llama_proj_2d(self.ln_vision_2d(image_embeds))
                if 'segmentation' not in task_types:
                    inputs_llama = rearrange(inputs_llama, "(b s) c d -> b s c d", b=B, s=S).to(torch.bfloat16)
                    atts_llama = torch.ones(inputs_llama.size()[:-2], dtype=torch.long).to(image.device)
                else:
                    inputs_llama = rearrange(inputs_llama, "(b s) c d -> b s c d", b=B, s=S).to(torch.bfloat16).detach() # add detach() for segmentation tasks
                    atts_llama = torch.ones(inputs_llama.size()[:-2], dtype=torch.long).to(image.device).detach()
         
        return inputs_llama, atts_llama, image_embeds_list

    def prompt_concat(self, img_embeds, atts_img, prompt):
        if prompt:
            batch_size = img_embeds.shape[0]
            p_after_embeds = self.embed_tokens(prompt.input_ids).expand(batch_size, -1, -1)
            wrapped_img_embeds = torch.cat([img_embeds, p_after_embeds], dim=1)
            wrapped_atts_img = atts_img[:, :1].expand(-1, wrapped_img_embeds.shape[1])                
            return wrapped_img_embeds, wrapped_atts_img
        else:
            return img_embeds, atts_img

    def prompt_wrap(self, img_embeds, atts_img, prompt_list, num_imgs, seg=None):
        bsz = img_embeds.shape[0]
        if prompt_list:
            img_idx = ([], [])
            for i in range(len(num_imgs)):
                for j in range(num_imgs[i]):
                    img_idx[0].append(i)
                    img_idx[1].append(j)
            prompt_tokens = self.llama_tokenizer(prompt_list, return_tensors="pt", padding="longest", truncation=True, max_length=256).to(img_embeds.device)
            idx = (prompt_tokens.input_ids == 32000).nonzero(as_tuple=True)
            prompt_tokens.input_ids[idx] = 123 # avoid memory issue
            p_embeds = self.embed_tokens(prompt_tokens.input_ids).expand(bsz, -1, -1)
            if seg is None:
                p_embeds[idx] = rearrange(img_embeds[img_idx], "b c d -> (b c) d").to(torch.bfloat16)
            else:
                p_embeds[idx] = rearrange(img_embeds[img_idx], "b c d -> (b c) d").to(torch.bfloat16).detach()
            return p_embeds, atts_img
        else:
            return img_embeds, atts_img

    def forward(self, samples):
        image = samples["image"]
        bsz = image.shape[0]
        img_embeds, atts_img, img_embeds_list = self.encode_img(image, samples['modal'], samples['task_type'])
        prefix_list = []
        tag_list = [[] for _ in range(bsz)]
        placeholder = ['<ImageHere>'] * 9 # 9 = the number of visual tokens
        for j in range(bsz):
            num = samples['num_imgs'][j]
            prefix = '' # Can add some prompt, such as 'You will be given an image, please describe everything you see' 
            for i in range(num):
                prefix += '<img' + str(i) + '>' + ''.join(x for x in placeholder) + '</img' + str(i) + '>' 
                tag_list[j].append('<img' + str(i) + '>')
            prefix_list.append('###Human:' + prefix)
        img_embeds, atts_img = self.prompt_wrap(img_embeds, atts_img, prefix_list, samples['num_imgs'], seg = None if 'segmentation' not in samples['task_type'] else 'yes')
        self.llama_tokenizer.padding_side = "right"

        prompt = [t for t in samples['question']]
        for i in range(len(prompt)):
            tags = ''
            for tag in tag_list[i]:
                if tag not in prompt[i]:
                    tags += tag
            prompt[i] = prompt[i].replace('_*_', tags)
        
        if 'detection' in samples['task_type'] or 'keypoint' in samples['task_type']:
            sample_ans = [ans.split('|||')[0] for ans in samples['answer']]
            sample_num = [ans.split('|||')[1] for ans in samples['answer']]
        else:
            sample_ans = samples['answer']
        text = ['###Assistant: ' + str(t) + self.end_sym for t in sample_ans]

        prompt_tokens = self.llama_tokenizer(
            prompt,
            return_tensors="pt",
            padding='longest',
            truncation=True,
            max_length=256,
            add_special_tokens=False
        ).to(image.device)

        img_embeds, atts_img = self.prompt_concat(img_embeds, atts_img, prompt_tokens) 

        to_regress_tokens = self.llama_tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            truncation=True,
            max_length=self.max_txt_len,
            add_special_tokens=False
        ).to(image.device)

        targets = to_regress_tokens.input_ids.masked_fill(
            to_regress_tokens.input_ids == self.llama_tokenizer.pad_token_id, -100
        )

        empty_targets = (
            torch.ones([atts_img.shape[0], atts_img.shape[1]+1],
                       dtype=torch.long).to(image.device).fill_(-100)  # plus one for bos
        )
        targets = torch.cat([empty_targets, targets], dim=1)

        batch_size = img_embeds.shape[0]
        bos = torch.ones([batch_size, 1],
                         dtype=to_regress_tokens.input_ids.dtype,
                         device=to_regress_tokens.input_ids.device) * self.llama_tokenizer.bos_token_id

        bos_embeds = self.embed_tokens(bos)
        atts_bos = atts_img[:, :1]

        to_regress_embeds = self.embed_tokens(to_regress_tokens.input_ids)
        inputs_embeds = torch.cat([bos_embeds, img_embeds, to_regress_embeds], dim=1)
        attention_mask = torch.cat([atts_bos, atts_img, to_regress_tokens.attention_mask], dim=1)
        with self.maybe_autocast():
            outputs = self.llama_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                return_dict=True,
                labels=targets,
                output_hidden_states=True,
            )
        loss = outputs.loss

        if 'detection' in samples['task_type']:
            with self.maybe_autocast():
                hidden_states = outputs.hidden_states[-1]
                token_mask = targets == self.det_token_idx
                target_states = hidden_states[token_mask]
                with self.maybe_autocast():
                    det_states = self.text_det(target_states)
                labels = trans_det(sample_num, det_states.shape[0])
                labels = labels.to(targets.device)
                det_loss = self.det_loss(det_states, labels)
                loss += det_loss * 1e2

        if 'keypoint' in samples['task_type']:
            with self.maybe_autocast():
                hidden_states = outputs.hidden_states[-1]
                token_mask = targets == self.point_token_idx_2d
                target_states = hidden_states[token_mask]
                with self.maybe_autocast():
                    point_states = self.text_point(target_states)
                labels = trans_keypoint(sample_num, point_states.shape[0])
                labels = labels.to(targets.device)
                keypoint_loss = self.keypoint_loss(point_states, labels)
                loss += keypoint_loss * 1e2

        if 'segmentation' in samples['task_type']:
            if 'ct' in samples['modal']:
                masks = samples['answer_img']
                with self.maybe_autocast():
                    img_embeds_list = self.visual_encoder_3d(image, encoder_only=True)
                    img_embeds_list = [_.to(targets.device) for _ in img_embeds_list]
                    hidden_states = outputs.hidden_states[-1]
                    token_mask = targets == self.seg_token_idx_3d
                    target_states = hidden_states[token_mask]   
                    seg_states = self.text2seg_3d(target_states)
                    last_feats = img_embeds_list[-1]
                    last_feats = last_feats + seg_states.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
                    last_feats = self.text2seg_3d_gn(last_feats)
                    img_embeds_list[-1] = last_feats
                    seg_preds = self.visual_encoder_3d(encoder_only=False, x_=img_embeds_list)
                    loss += self.seg_loss(seg_preds, masks.float()) # +
            else:  
                masks = samples['answer_img']
                with self.maybe_autocast():
                    feats = self.model_seg_2d.encoder(image[:,0])
                    last_feats = feats[-1]
                    hidden_states = outputs.hidden_states[-1]
                    token_mask = targets == self.seg_token_idx_2d
                    target_states = hidden_states[token_mask]
                    seg_states = self.text2seg_2d(target_states)
                    last_feats = last_feats+seg_states.unsqueeze(-1).unsqueeze(-1)
                    last_feats = self.text2seg_2d_gn(last_feats)
                    feats[-1] = last_feats
                    seg_feats = self.model_seg_2d.decoder(*feats)
                    seg_preds = self.model_seg_2d.segmentation_head(seg_feats)
                    loss += self.seg_loss(seg_preds, masks.float())

        return {"loss": loss, "modal": samples['modal'][0], "task_type": samples['task_type'][0]}

    @classmethod
    def from_config(cls, cfg, finetune=False):
        # llama_model = cfg.get("llama_model")
        # freeze_vit = cfg.get("freeze_vit", True)
        # low_resource = cfg.get("low_resource", False)
        # max_txt_len = cfg.get("max_txt_len", 32)
        # end_sym = cfg.get("end_sym", '\n')
        # ipdb.set_trace()

        # model = cls(
        #     freeze_vit=freeze_vit,
        #     llama_model=llama_model,
        #     max_txt_len=max_txt_len,
        #     low_resource=low_resource,
        #     end_sym=end_sym
        # )
        model = cls(cfg)

        # load checkpoint
        ckpt_path = cfg.get("ckpt", "")
        if ckpt_path:
            print("Load Checkpoint: {}".format(ckpt_path))
            ckpt = torch.load(ckpt_path, map_location="cpu")
            if finetune:
                current_model_dict = model.state_dict()
                weights = ckpt['model']
                new_state_dict = {}
                for k in list(current_model_dict.keys()):
                    if k in list(weights.keys()):
                        if weights[k].size() == current_model_dict[k].size():
                            new_state_dict[k] = weights[k]
                        else:
                            new_state_dict[k] = current_model_dict[k]
                    else:
                        print(k)
                        new_state_dict[k] = current_model_dict[k]
                msg = model.load_state_dict(new_state_dict, strict=False)
            else:
                msg = model.load_state_dict(ckpt['model'], strict=False)

        return model