oleksandrfluxon
/

mpt-7b-instruct-evaluate

Text Generation

text-generation-inference

Model card Files Files and versions Community

oleksandrfluxon committed on Jul 21, 2023

Commit

8506a61

•

1 Parent(s): 20e0039

Update handler.py

Files changed (1) hide show

handler.py +4 -2

handler.py CHANGED Viewed

@@ -25,7 +25,8 @@ class EndpointHandler:
                 config,
                 device_map="auto",
                 torch_dtype=torch.float16,
-                trust_remote_code=True
             )
             max_memory = get_balanced_memory(
@@ -52,7 +53,8 @@ class EndpointHandler:
         with torch.autocast('cuda'):
             # preprocess
-            inputs = self.tokenizer(inputs, return_tensors="pt").to(self.device)
             # pass inputs with all kwargs in data
             if parameters is not None:

                 config,
                 device_map="auto",
                 torch_dtype=torch.float16,
+                trust_remote_code=True,
+                load_in_8bit=True # Load model with 8-bit quantization
             )
             max_memory = get_balanced_memory(
         with torch.autocast('cuda'):
             # preprocess
+            # inputs = self.tokenizer(inputs, return_tensors="pt").to(self.device)
+            inputs = self.tokenizer(inputs, return_tensors="pt")
             # pass inputs with all kwargs in data
             if parameters is not None: