Adding cpu offloading (if necessary?)
Browse files — handler.py (+1 −4)
handler.py
CHANGED
@@ -19,14 +19,11 @@ class EndpointHandler():
|
|
19 |
model = Blip2ForConditionalGeneration(config)
|
20 |
device_map = infer_auto_device_map(model, no_split_module_classes=["T5Block"])
|
21 |
device_map['language_model.lm_head'] = device_map["language_model.encoder.embed_tokens"]
|
22 |
-
|
23 |
-
print(device_map)
|
24 |
-
exit()
|
25 |
|
26 |
self.model = Blip2ForConditionalGeneration.from_pretrained(
|
27 |
"Salesforce/blip2-flan-t5-xxl", device_map=device_map,
|
28 |
# torch_dtype=torch.float16
|
29 |
-
load_in_8bit=True,
|
30 |
)
|
31 |
|
32 |
def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
|
|
|
19 |
model = Blip2ForConditionalGeneration(config)
|
20 |
device_map = infer_auto_device_map(model, no_split_module_classes=["T5Block"])
|
21 |
device_map['language_model.lm_head'] = device_map["language_model.encoder.embed_tokens"]
|
|
|
|
|
|
|
22 |
|
23 |
self.model = Blip2ForConditionalGeneration.from_pretrained(
|
24 |
"Salesforce/blip2-flan-t5-xxl", device_map=device_map,
|
25 |
# torch_dtype=torch.float16
|
26 |
+
load_in_8bit=True, load_in_8bit_fp32_cpu_offload=True
|
27 |
)
|
28 |
|
29 |
def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
|