oleksandrfluxon committed
Commit 2419282
1 Parent(s): ef35985

Update handler.py

Files changed (1): handler.py (+34, -33)
handler.py CHANGED
@@ -8,41 +8,42 @@ from accelerate.utils import get_balanced_memory
 
 class EndpointHandler:
     def __init__(self, path=""):
-        config = AutoConfig.from_pretrained(
-            path,
+        with torch.autocast('cuda'):
+            config = AutoConfig.from_pretrained(
+                path,
+                trust_remote_code=True
+            )
+            # config.attn_config['attn_impl'] = 'triton'
+            config.init_device = 'cuda:0' # For fast initialization directly on GPU!
+            config.max_seq_len = 4096 # (input + output) tokens can now be up to 4096
+
+
+            # load model and tokenizer from path
+            self.tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b', padding_side="left")
+            model = AutoModelForCausalLM.from_pretrained(
+                path,
+                config,
+                device_map="auto",
+                torch_dtype=torch.float16,
             trust_remote_code=True
         )
-        # config.attn_config['attn_impl'] = 'triton'
-        config.init_device = 'cuda:0' # For fast initialization directly on GPU!
-        config.max_seq_len = 4096 # (input + output) tokens can now be up to 4096
-
-
-        # load model and tokenizer from path
-        self.tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b', padding_side="left")
-        model = AutoModelForCausalLM.from_pretrained(
-            path,
-            config,
-            device_map="auto",
-            torch_dtype=torch.float16,
-            trust_remote_code=True
-        )
-
-        max_memory = get_balanced_memory(
-            model,
-            max_memory=None,
-            no_split_module_classes=["MPTBlock"],
-            dtype='float16',
-            low_zero=False
-        )
-        device_map = infer_auto_device_map(
-            model,
-            max_memory=max_memory,
-            no_split_module_classes=["MPTBlock"],
-            dtype='float16'
-        )
-        self.model = dispatch_model(model, device_map=device_map)
-
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+            max_memory = get_balanced_memory(
+                model,
+                max_memory=None,
+                no_split_module_classes=["MPTBlock"],
+                dtype='float16',
+                low_zero=False
+            )
+            device_map = infer_auto_device_map(
+                model,
+                max_memory=max_memory,
+                no_split_module_classes=["MPTBlock"],
+                dtype='float16'
+            )
+            self.model = dispatch_model(model, device_map=device_map)
+
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
 
     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
         # process input
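
The only functional change in this commit is the new `with torch.autocast('cuda'):` context wrapped around the whole of `__init__`. `torch.autocast` is PyTorch's mixed-precision context manager: it changes the dtype selected for eligible ops executed inside the block, so it mainly matters for forward passes rather than for how checkpoint weights are stored during loading. A minimal sketch of the conventional pattern, assuming a generic causal-LM `model`/`tokenizer` pair (the function and names below are illustrative, not from this repo):

import torch

def generate_fp16(model, tokenizer, prompt: str) -> str:
    # Tokenize and move the inputs to the model's first device.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # autocast("cuda") runs eligible CUDA ops in half precision for the
    # duration of the block; no_grad skips autograd bookkeeping at inference.
    with torch.autocast("cuda", dtype=torch.float16), torch.no_grad():
        output_ids = model.generate(**inputs, max_new_tokens=64)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)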
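
One pre-existing wart the commit keeps: `config` is passed to `AutoModelForCausalLM.from_pretrained` positionally. In `transformers`, positional arguments after the path are forwarded to the model constructor as `*model_args` rather than being treated as the config, so the `init_device`/`max_seq_len` overrides may not take effect (or may raise a `TypeError`, depending on the model class). The documented spelling uses the `config=` keyword:

model = AutoModelForCausalLM.from_pretrained(
    path,
    config=config,             # keyword form, so the overrides above apply
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)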
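
Note also that the tokenizer is loaded from `EleutherAI/gpt-neox-20b` rather than from `path`; the `MPTBlock` no-split class suggests an MPT-family model, and those ship with the GPT-NeoX tokenizer, so this is plausible rather than a mistake. `padding_side="left"` is the right choice for batched generation with a decoder-only model, as this sketch illustrates (the pad-token line is an assumed workaround; this tokenizer has no pad token out of the box):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b", padding_side="left")
tok.pad_token = tok.eos_token   # assumption: reuse EOS, since no pad token is defined

batch = tok(["short prompt", "a noticeably longer prompt"],
            return_tensors="pt", padding=True)
# With left padding the final position of every row is real text, so
# model.generate(**batch) continues each prompt instead of continuing pad tokens.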
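
The second half of `__init__` re-derives a device map by hand even though `device_map="auto"` already placed the model: `get_balanced_memory` computes a per-device memory budget, `infer_auto_device_map` assigns modules under that budget without splitting any `MPTBlock`, and `dispatch_model` moves the weights and installs the cross-device hooks. A self-contained sketch of the same `accelerate` pattern on a small model (the model id and no-split class here are illustrative substitutes):

import torch
from accelerate import dispatch_model, infer_auto_device_map
from accelerate.utils import get_balanced_memory
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-70m",
                                             torch_dtype=torch.float16)

# Budget memory evenly across the visible devices.
max_memory = get_balanced_memory(
    model,
    max_memory=None,                           # None = probe available devices
    no_split_module_classes=["GPTNeoXLayer"],  # keep each block on one device
    dtype="float16",
    low_zero=False,
)

# Map each module to a device under that budget, then physically move it.
device_map = infer_auto_device_map(
    model,
    max_memory=max_memory,
    no_split_module_classes=["GPTNeoXLayer"],
    dtype="float16",
)
model = dispatch_model(model, device_map=device_map)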
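
For context on how this class is driven: Inference Endpoints import `handler.py`, construct `EndpointHandler` once with the repository path, and then call the instance per request with the JSON payload as a dict; the truncated `__call__` above is that per-request hook. A hedged usage sketch (the `/repository` path and payload keys follow the usual custom-handler convention and are not shown in this diff):

from handler import EndpointHandler

handler = EndpointHandler(path="/repository")   # runs the __init__ shown above once
result = handler({
    "inputs": "Explain mixed precision in one sentence.",
    "parameters": {"max_new_tokens": 64},       # assumed to be honored by __call__
})
print(result)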