oleksandrfluxon
committed on
Commit
•
8506a61
1
Parent(s):
20e0039
Update handler.py
Browse files- handler.py +4 -2
handler.py
CHANGED
@@ -25,7 +25,8 @@ class EndpointHandler:
|
|
25 |
config,
|
26 |
device_map="auto",
|
27 |
torch_dtype=torch.float16,
|
28 |
-
trust_remote_code=True
|
|
|
29 |
)
|
30 |
|
31 |
max_memory = get_balanced_memory(
|
@@ -52,7 +53,8 @@ class EndpointHandler:
|
|
52 |
|
53 |
with torch.autocast('cuda'):
|
54 |
# preprocess
|
55 |
-
inputs = self.tokenizer(inputs, return_tensors="pt").to(self.device)
|
|
|
56 |
|
57 |
# pass inputs with all kwargs in data
|
58 |
if parameters is not None:
|
|
|
25 |
config,
|
26 |
device_map="auto",
|
27 |
torch_dtype=torch.float16,
|
28 |
+
trust_remote_code=True,
|
29 |
+
load_in_8bit=True # Load model with 8-bit precision quantization
|
30 |
)
|
31 |
|
32 |
max_memory = get_balanced_memory(
|
|
|
53 |
|
54 |
with torch.autocast('cuda'):
|
55 |
# preprocess
|
56 |
+
# inputs = self.tokenizer(inputs, return_tensors="pt").to(self.device)
|
57 |
+
inputs = self.tokenizer(inputs, return_tensors="pt")
|
58 |
|
59 |
# pass inputs with all kwargs in data
|
60 |
if parameters is not None:
|