Spaces:

aletrn
/

lisa-on-cuda

Paused

App Files Community

x-lai committed on Aug 3, 2023

Commit

968fffb

1 Parent(s): 0146331

support 4bit and 8bit inference

Browse files

Former-commit-id: 23930126323a0effb75929a5cc88c75c0d7bfbc2

Files changed (5) hide show

README.md +6 -1
chat.py +21 -4
model/LISA.py +71 -50
model/llava/model/llava.py +2 -0
model/segment_anything/modeling/image_encoder.py +7 -2

README.md CHANGED Viewed

@@ -53,10 +53,15 @@ To chat with [LISA-13B-llama2-v0](https://huggingface.co/xinlai/LISA-13B-llama2-
 ```
 CUDA_VISIBLE_DEVICES=0 python3 chat.py --version='xinlai/LISA-13B-llama2-v0'
 ```
-To use `bfloat16` data type for inference:
 ```
 CUDA_VISIBLE_DEVICES=0 python3 chat.py --version='xinlai/LISA-13B-llama2-v0' --precision='bf16'
 ```
 After that, input the text prompt and then the image path. For example，
 ```

 ```
 CUDA_VISIBLE_DEVICES=0 python3 chat.py --version='xinlai/LISA-13B-llama2-v0'
 ```
+To use `bf16` or `fp16` data type for inference:
 ```
 CUDA_VISIBLE_DEVICES=0 python3 chat.py --version='xinlai/LISA-13B-llama2-v0' --precision='bf16'
 ```
+To use `8bit` or `4bit` data type for inference:
+```
+CUDA_VISIBLE_DEVICES=0 python3 chat.py --version='xinlai/LISA-13B-llama2-v0' --precision='fp16' --load_in_8bit
+CUDA_VISIBLE_DEVICES=0 python3 chat.py --version='xinlai/LISA-13B-llama2-v0' --precision='fp16' --load_in_4bit
+```
 After that, input the text prompt and then the image path. For example，
 ```

chat.py CHANGED Viewed

@@ -17,19 +17,22 @@ def parse_args(args):
   parser = argparse.ArgumentParser(description='LISA chat')
   parser.add_argument('--version', default='xinlai/LISA-13B-llama2-v0')
   parser.add_argument('--vis_save_path', default='./vis_output', type=str)
-  parser.add_argument('--precision', default='bf16', type=str, choices=['fp32', 'bf16'], help="precision for inference")
   parser.add_argument('--image-size', default=1024, type=int, help='image size')
   parser.add_argument('--model-max-length', default=512, type=int)
   parser.add_argument('--lora-r', default=-1, type=int)
   parser.add_argument('--vision-tower', default='openai/clip-vit-large-patch14', type=str)
   parser.add_argument('--local-rank', default=0, type=int, help='node rank')
   return parser.parse_args(args)
 def preprocess(x,
-  pixel_mean=torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1),
-  pixel_std=torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1),
-  img_size=1024) -> torch.Tensor:
     """Normalize pixel values and pad to a square input."""
     # Normalize colors
     x = (x - pixel_mean) / pixel_std
@@ -65,6 +68,8 @@ def main(args):
     args.version,
     args.lora_r,
     args.precision,
   )
   weight = {}
@@ -76,6 +81,14 @@ def main(args):
   if args.precision == 'bf16':
     model = model.bfloat16().cuda()
   else:
     model = model.float().cuda()
@@ -113,12 +126,16 @@ def main(args):
     original_size_list = [image.shape[:2]]
     if args.precision == 'bf16':
       images_clip = clip_image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0].unsqueeze(0).cuda().bfloat16()
     else:
       images_clip = clip_image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0].unsqueeze(0).cuda().float()
     images = transform.apply_image(image)
     resize_list = [images.shape[:2]]
     if args.precision == 'bf16':
       images = preprocess(torch.from_numpy(images).permute(2,0,1).contiguous()).unsqueeze(0).cuda().bfloat16()
     else:
       images = preprocess(torch.from_numpy(images).permute(2,0,1).contiguous()).unsqueeze(0).cuda().float()

   parser = argparse.ArgumentParser(description='LISA chat')
   parser.add_argument('--version', default='xinlai/LISA-13B-llama2-v0')
   parser.add_argument('--vis_save_path', default='./vis_output', type=str)
+  parser.add_argument('--precision', default='bf16', type=str, choices=['fp32', 'bf16', 'fp16'], help="precision for inference")
   parser.add_argument('--image-size', default=1024, type=int, help='image size')
   parser.add_argument('--model-max-length', default=512, type=int)
   parser.add_argument('--lora-r', default=-1, type=int)
   parser.add_argument('--vision-tower', default='openai/clip-vit-large-patch14', type=str)
   parser.add_argument('--local-rank', default=0, type=int, help='node rank')
+  parser.add_argument('--load_in_8bit', action='store_true', default=False)
+  parser.add_argument('--load_in_4bit', action='store_true', default=False)
   return parser.parse_args(args)
 def preprocess(x,
+    pixel_mean=torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1),
+    pixel_std=torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1),
+    img_size=1024
+  ) -> torch.Tensor:
     """Normalize pixel values and pad to a square input."""
     # Normalize colors
     x = (x - pixel_mean) / pixel_std
     args.version,
     args.lora_r,
     args.precision,
+    load_in_8bit=args.load_in_8bit,
+    load_in_4bit=args.load_in_4bit,
   )
   weight = {}
   if args.precision == 'bf16':
     model = model.bfloat16().cuda()
+  elif args.precision == 'fp16':
+    import deepspeed
+    model_engine = deepspeed.init_inference(model=model,
+      dtype=torch.half,
+      replace_with_kernel_inject=True,
+      replace_method="auto",
+    )
+    model = model_engine.module
   else:
     model = model.float().cuda()
     original_size_list = [image.shape[:2]]
     if args.precision == 'bf16':
       images_clip = clip_image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0].unsqueeze(0).cuda().bfloat16()
+    elif args.precision == 'fp16':
+      images_clip = clip_image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0].unsqueeze(0).cuda().half()
     else:
       images_clip = clip_image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0].unsqueeze(0).cuda().float()
     images = transform.apply_image(image)
     resize_list = [images.shape[:2]]
     if args.precision == 'bf16':
       images = preprocess(torch.from_numpy(images).permute(2,0,1).contiguous()).unsqueeze(0).cuda().bfloat16()
+    elif args.precision == 'fp16':
+      images = preprocess(torch.from_numpy(images).permute(2,0,1).contiguous()).unsqueeze(0).cuda().half()
     else:
       images = preprocess(torch.from_numpy(images).permute(2,0,1).contiguous()).unsqueeze(0).cuda().float()

model/LISA.py CHANGED Viewed

@@ -9,7 +9,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 import transformers
-from transformers import LlamaForCausalLM, CLIPVisionModel
 from peft import (
     LoraConfig,
     get_peft_model,
@@ -49,6 +49,8 @@ class LISA(nn.Module):
     llm_version,
     lora_r,
     precision,
     lora_target_modules=['q_proj', 'v_proj'],
     lora_alpha=16,
     lora_dropout=0.05,
@@ -69,6 +71,20 @@ class LISA(nn.Module):
     num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
     if precision == "bf16":
       self.lm = LlavaLlamaForCausalLM.from_pretrained(llm_version, torch_dtype=torch.bfloat16, cache_dir=None, low_cpu_mem_usage=True)
     else:
       self.lm = LlavaLlamaForCausalLM.from_pretrained(llm_version, torch_dtype=torch.float32, cache_dir=None, low_cpu_mem_usage=True)
@@ -85,6 +101,8 @@ class LISA(nn.Module):
     if vision_tower.device.type == 'meta':
         if precision == 'bf16':
           vision_tower = CLIPVisionModel.from_pretrained(vision_tower.config._name_or_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).cuda(local_rank)
         else:
           vision_tower = CLIPVisionModel.from_pretrained(vision_tower.config._name_or_path, torch_dtype=torch.float32, low_cpu_mem_usage=True).cuda(local_rank)
         self.lm.get_model().vision_tower[0] = vision_tower
@@ -92,6 +110,8 @@ class LISA(nn.Module):
         if precision == "bf16":
           vision_tower.to(device='cuda', dtype=torch.bfloat16)
         else:
           vision_tower.to(device='cuda', dtype=torch.float32)
@@ -135,58 +155,59 @@ class LISA(nn.Module):
   def evaluate(self, images_clip, images, input_ids, resize_list, original_size_list, max_new_tokens=32, tokenizer=None):
-    outputs = self.lm.generate(images=images_clip, input_ids=input_ids, max_new_tokens=max_new_tokens, num_beams=1, output_hidden_states=True, return_dict_in_generate=True)
-    output_hidden_states = outputs.hidden_states[-1]
-    output_ids = outputs.sequences
-    seg_token_mask = (output_ids[:, 1:] == self.seg_token_idx)
-    last_embedding = None
-    last_output_logit = None
-    hidden_states = []
-    assert len(self.text_hidden_fcs) == 1
-    hidden_states.append(self.text_hidden_fcs[0](output_hidden_states))
-    last_hidden_state = torch.stack(hidden_states, dim=-1).sum(dim=-1)
-    pred_embeddings = last_hidden_state[seg_token_mask]
-    seg_token_counts = seg_token_mask.int().sum(-1) #[bs, ]
-    seg_token_offset = seg_token_counts.cumsum(-1)
-    seg_token_offset = torch.cat([torch.zeros(1).long().cuda(), seg_token_offset], dim=0)
-    pred_embeddings_ = []
-    for i in range(len(seg_token_offset)-1):
-      start_i, end_i = seg_token_offset[i], seg_token_offset[i+1]
-      pred_embeddings_.append(pred_embeddings[start_i: end_i])
-    pred_embeddings = pred_embeddings_
-    image_embeddings = self.get_visual_embs(images)
-    multimask_output = False
-    pred_masks = []
-    for i in range(len(pred_embeddings)):
-      sparse_embeddings, dense_embeddings = self.visual_model.prompt_encoder(
-          points=None,
-          boxes=None,
-          masks=None,
-          text_embeds=pred_embeddings[i].unsqueeze(1),
-      )
-      sparse_embeddings = sparse_embeddings.to(pred_embeddings[i].dtype)
-      low_res_masks, iou_predictions = self.visual_model.mask_decoder(
-          image_embeddings=image_embeddings[i].unsqueeze(0),
-          image_pe=self.visual_model.prompt_encoder.get_dense_pe(),
-          sparse_prompt_embeddings=sparse_embeddings,
-          dense_prompt_embeddings=dense_embeddings,
-          multimask_output=multimask_output,
-      )
-      pred_mask = self.visual_model.postprocess_masks(
-          low_res_masks,
-          input_size=resize_list[i],
-          original_size=original_size_list[i],
-      )
-      pred_masks.append(pred_mask[:, 0])
     return output_ids, pred_masks

 import torch.nn.functional as F
 import transformers
+from transformers import LlamaForCausalLM, CLIPVisionModel, BitsAndBytesConfig
 from peft import (
     LoraConfig,
     get_peft_model,
     llm_version,
     lora_r,
     precision,
+    load_in_4bit=False,
+    load_in_8bit=False,
     lora_target_modules=['q_proj', 'v_proj'],
     lora_alpha=16,
     lora_dropout=0.05,
     num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
     if precision == "bf16":
       self.lm = LlavaLlamaForCausalLM.from_pretrained(llm_version, torch_dtype=torch.bfloat16, cache_dir=None, low_cpu_mem_usage=True)
+    elif precision == "fp16":
+      if load_in_4bit:
+        self.lm = LlavaLlamaForCausalLM.from_pretrained(llm_version, load_in_4bit=True, cache_dir=None, low_cpu_mem_usage=True, device_map='auto',
+          quantization_config=BitsAndBytesConfig(
+              load_in_4bit=True,
+              bnb_4bit_compute_dtype=torch.float16,
+              bnb_4bit_use_double_quant=True,
+              bnb_4bit_quant_type='nf4'
+          )
+        )
+      elif load_in_8bit:
+        self.lm = LlavaLlamaForCausalLM.from_pretrained(llm_version, load_in_8bit=True, cache_dir=None, low_cpu_mem_usage=True, device_map='auto')
+      else:
+        self.lm = LlavaLlamaForCausalLM.from_pretrained(llm_version, torch_dtype=torch.half, cache_dir=None, low_cpu_mem_usage=True)
     else:
       self.lm = LlavaLlamaForCausalLM.from_pretrained(llm_version, torch_dtype=torch.float32, cache_dir=None, low_cpu_mem_usage=True)
     if vision_tower.device.type == 'meta':
         if precision == 'bf16':
           vision_tower = CLIPVisionModel.from_pretrained(vision_tower.config._name_or_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).cuda(local_rank)
+        elif precision == 'fp16':
+          vision_tower = CLIPVisionModel.from_pretrained(vision_tower.config._name_or_path, torch_dtype=torch.half, low_cpu_mem_usage=True).cuda(local_rank)
         else:
           vision_tower = CLIPVisionModel.from_pretrained(vision_tower.config._name_or_path, torch_dtype=torch.float32, low_cpu_mem_usage=True).cuda(local_rank)
         self.lm.get_model().vision_tower[0] = vision_tower
         if precision == "bf16":
           vision_tower.to(device='cuda', dtype=torch.bfloat16)
+        elif precision == "fp16":
+          vision_tower.to(device='cuda', dtype=torch.half)
         else:
           vision_tower.to(device='cuda', dtype=torch.float32)
   def evaluate(self, images_clip, images, input_ids, resize_list, original_size_list, max_new_tokens=32, tokenizer=None):
+    with torch.no_grad():
+      outputs = self.lm.generate(images=images_clip, input_ids=input_ids, max_new_tokens=max_new_tokens, num_beams=1, output_hidden_states=True, return_dict_in_generate=True)
+      output_hidden_states = outputs.hidden_states[-1]
+      output_ids = outputs.sequences
+      seg_token_mask = (output_ids[:, 1:] == self.seg_token_idx)
+      last_embedding = None
+      last_output_logit = None
+      hidden_states = []
+      assert len(self.text_hidden_fcs) == 1
+      hidden_states.append(self.text_hidden_fcs[0](output_hidden_states))
+      last_hidden_state = torch.stack(hidden_states, dim=-1).sum(dim=-1)
+      pred_embeddings = last_hidden_state[seg_token_mask]
+      seg_token_counts = seg_token_mask.int().sum(-1) #[bs, ]
+      seg_token_offset = seg_token_counts.cumsum(-1)
+      seg_token_offset = torch.cat([torch.zeros(1).long().cuda(), seg_token_offset], dim=0)
+      pred_embeddings_ = []
+      for i in range(len(seg_token_offset)-1):
+        start_i, end_i = seg_token_offset[i], seg_token_offset[i+1]
+        pred_embeddings_.append(pred_embeddings[start_i: end_i])
+      pred_embeddings = pred_embeddings_
+      image_embeddings = self.get_visual_embs(images)
+      multimask_output = False
+      pred_masks = []
+      for i in range(len(pred_embeddings)):
+        sparse_embeddings, dense_embeddings = self.visual_model.prompt_encoder(
+            points=None,
+            boxes=None,
+            masks=None,
+            text_embeds=pred_embeddings[i].unsqueeze(1),
+        )
+        sparse_embeddings = sparse_embeddings.to(pred_embeddings[i].dtype)
+        low_res_masks, iou_predictions = self.visual_model.mask_decoder(
+            image_embeddings=image_embeddings[i].unsqueeze(0),
+            image_pe=self.visual_model.prompt_encoder.get_dense_pe(),
+            sparse_prompt_embeddings=sparse_embeddings,
+            dense_prompt_embeddings=dense_embeddings,
+            multimask_output=multimask_output,
+        )
+        pred_mask = self.visual_model.postprocess_masks(
+            low_res_masks,
+            input_size=resize_list[i],
+            original_size=original_size_list[i],
+        )
+        pred_masks.append(pred_mask[:, 0])
     return output_ids, pred_masks

model/llava/model/llava.py CHANGED Viewed

@@ -63,6 +63,8 @@ class LlavaLlamaModel(LlamaModel):
         vision_tower.requires_grad_(False)
         if precision == 'bf16':
             vision_tower = vision_tower.to(torch.bfloat16)
         else:
             vision_tower = vision_tower.to(torch.float32)

         vision_tower.requires_grad_(False)
         if precision == 'bf16':
             vision_tower = vision_tower.to(torch.bfloat16)
+        elif precision == 'fp16':
+            vision_tower = vision_tower.to(torch.half)
         else:
             vision_tower = vision_tower.to(torch.float32)

model/segment_anything/modeling/image_encoder.py CHANGED Viewed

@@ -114,8 +114,13 @@ class ImageEncoderViT(nn.Module):
         for blk in self.blocks:
             x = blk(x)
-        x = self.neck(x.permute(0, 3, 1, 2))
         return x

         for blk in self.blocks:
             x = blk(x)
+        dtype = x.dtype
+        if dtype == torch.float16: # prevent overflow
+            with torch.autocast(device_type='cuda', dtype=torch.float32):
+                x = self.neck(x.permute(0, 3, 1, 2))
+            x = x.to(dtype)
+        else:
+            x = self.neck(x.permute(0, 3, 1, 2))
         return x