oleksandrfluxon
/

mpt-7b-instruct-evaluate

Text Generation

text-generation-inference

Model card Files and versions Community

oleksandrfluxon committed on Jul 21, 2023

Commit

2419282

•

1 Parent(s): ef35985

Update handler.py

Files changed (1) hide show

handler.py +34 -33

handler.py CHANGED Viewed

@@ -8,41 +8,42 @@ from accelerate.utils import get_balanced_memory
 class EndpointHandler:
     def __init__(self, path=""):
-        config = AutoConfig.from_pretrained(
-                path,
                 trust_remote_code=True
             )
-        # config.attn_config['attn_impl'] = 'triton'
-        config.init_device = 'cuda:0' # For fast initialization directly on GPU!
-        config.max_seq_len = 4096 # (input + output) tokens can now be up to 4096
-        # load model and tokenizer from path
-        self.tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b', padding_side="left")
-        model = AutoModelForCausalLM.from_pretrained(
-            path,
-            config,
-            device_map="auto",
-            torch_dtype=torch.float16,
-            trust_remote_code=True
-        )
-        max_memory = get_balanced_memory(
-            model,
-            max_memory=None,
-            no_split_module_classes=["MPTBlock"],
-            dtype='float16',
-            low_zero=False
-        )
-        device_map = infer_auto_device_map(
-            model,
-            max_memory=max_memory,
-            no_split_module_classes=["MPTBlock"],
-            dtype='float16'
-        )
-        self.model = dispatch_model(model, device_map=device_map)
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
         # process input

 class EndpointHandler:
     def __init__(self, path=""):
+        with torch.autocast('cuda'):
+            config = AutoConfig.from_pretrained(
+                    path,
+                    trust_remote_code=True
+                )
+            # config.attn_config['attn_impl'] = 'triton'
+            config.init_device = 'cuda:0' # For fast initialization directly on GPU!
+            config.max_seq_len = 4096 # (input + output) tokens can now be up to 4096
+            # load model and tokenizer from path
+            self.tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b', padding_side="left")
+            model = AutoModelForCausalLM.from_pretrained(
+                path,
+                config,
+                device_map="auto",
+                torch_dtype=torch.float16,
                 trust_remote_code=True
             )
+            max_memory = get_balanced_memory(
+                model,
+                max_memory=None,
+                no_split_module_classes=["MPTBlock"],
+                dtype='float16',
+                low_zero=False
+            )
+            device_map = infer_auto_device_map(
+                model,
+                max_memory=max_memory,
+                no_split_module_classes=["MPTBlock"],
+                dtype='float16'
+            )
+            self.model = dispatch_model(model, device_map=device_map)
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
         # process input