oleksandrfluxon committed on
Commit
8506a61
1 Parent(s): 20e0039

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +4 -2
handler.py CHANGED
@@ -25,7 +25,8 @@ class EndpointHandler:
25
  config,
26
  device_map="auto",
27
  torch_dtype=torch.float16,
28
- trust_remote_code=True
 
29
  )
30
 
31
  max_memory = get_balanced_memory(
@@ -52,7 +53,8 @@ class EndpointHandler:
52
 
53
  with torch.autocast('cuda'):
54
  # preprocess
55
- inputs = self.tokenizer(inputs, return_tensors="pt").to(self.device)
 
56
 
57
  # pass inputs with all kwargs in data
58
  if parameters is not None:
 
25
  config,
26
  device_map="auto",
27
  torch_dtype=torch.float16,
28
+ trust_remote_code=True,
29
+ load_in_8bit=True # Load model weights with 8-bit quantization
30
  )
31
 
32
  max_memory = get_balanced_memory(
 
53
 
54
  with torch.autocast('cuda'):
55
  # preprocess
56
+ # inputs = self.tokenizer(inputs, return_tensors="pt").to(self.device)
57
+ inputs = self.tokenizer(inputs, return_tensors="pt")
58
 
59
  # pass inputs with all kwargs in data
60
  if parameters is not None: