oleksandrfluxon
/

mpt-7b-instruct-evaluate

@@ -1,16 +1,47 @@
 import torch
 from typing import Any, Dict
-from transformers import AutoModelForCausalLM, AutoTokenizer
 class EndpointHandler:
     def __init__(self, path=""):
         # load model and tokenizer from path
-        self.tokenizer = AutoTokenizer.from_pretrained(path)
-        self.model = AutoModelForCausalLM.from_pretrained(
-            path, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True
         )
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
@@ -18,16 +49,17 @@ class EndpointHandler:
         inputs = data.pop("inputs", data)
         parameters = data.pop("parameters", None)
-        # preprocess
-        inputs = self.tokenizer(inputs, return_tensors="pt").to(self.device)
-        # pass inputs with all kwargs in data
-        if parameters is not None:
-            outputs = self.model.generate(**inputs, **parameters)
-        else:
-            outputs = self.model.generate(**inputs)
-        # postprocess the prediction
-        prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-        return [{"generated_text": prediction}]

 import torch
 from typing import Any, Dict
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+from accelerate import dispatch_model, infer_auto_device_map
+from accelerate.utils import get_balanced_memory
 class EndpointHandler:
     """Custom inference handler: loads a causal LM from ``path`` and serves text generation."""
     def __init__(self, path=""):
         """Load the config, tokenizer and model, then dispatch the model across devices.

         Args:
             path: Location handed to ``from_pretrained`` for the config and the weights.
         """
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         config = AutoConfig.from_pretrained(path, trust_remote_code=True)
         # config.attn_config['attn_impl'] = 'triton'
         # Initialise weights directly on the GPU when one is present; fall back to
         # the CPU so construction does not fail on hosts without CUDA.
         config.init_device = "cuda:0" if self.device == "cuda" else "cpu"
         config.max_seq_len = 4096  # (input + output) tokens can now be up to 4096
         # load model and tokenizer from path
         # AutoTokenizer is imported by name; the bare module name `transformers`
         # is not in scope, so it must not be referenced as `transformers.AutoTokenizer`.
         self.tokenizer = AutoTokenizer.from_pretrained(
             "EleutherAI/gpt-neox-20b", padding_side="left"
         )
         model = AutoModelForCausalLM.from_pretrained(
             path,
             # Must be a keyword: extra positional arguments are forwarded to the
             # model constructor instead of being used as the configuration.
             config=config,
             # NOTE(review): the model is dispatched again below with a balanced
             # device map; confirm that loading with device_map="auto" is still wanted.
             device_map="auto",
             torch_dtype=torch.float16,
             trust_remote_code=True,
         )
         # An MPTBlock is never split across devices.
         no_split = ["MPTBlock"]
         max_memory = get_balanced_memory(
             model,
             max_memory=None,
             no_split_module_classes=no_split,
             dtype="float16",
             low_zero=False,
         )
         device_map = infer_auto_device_map(
             model,
             max_memory=max_memory,
             no_split_module_classes=no_split,
             dtype="float16",
         )
         self.model = dispatch_model(model, device_map=device_map)
     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
         """Generate text for a single request payload.

         Args:
             data: Request payload. ``"inputs"`` holds the prompt (the payload itself
                 is used when the key is absent); the optional ``"parameters"`` entry
                 holds keyword arguments forwarded to ``generate``. Both keys are
                 popped from ``data``.

         Returns:
             A one-element list of the form ``[{"generated_text": <decoded text>}]``.
         """
         # NOTE(review): the return annotation says Dict, but a list of dicts is returned.
         inputs = data.pop("inputs", data)
         parameters = data.pop("parameters", None)
         generate_kwargs = {} if parameters is None else parameters
         # preprocess
         encoded = self.tokenizer(inputs, return_tensors="pt").to(self.device)
         # Only the forward passes need autocast; tokenizing and decoding do not.
         with torch.autocast('cuda'):
             # pass inputs with all kwargs in data
             outputs = self.model.generate(**encoded, **generate_kwargs)
         # postprocess the prediction
         prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
         return [{"generated_text": prediction}]