oleksandrfluxon committed
Commit 1ba92de
1 Parent(s): f5d256b

Update pipeline.py

Files changed (1)
  1. pipeline.py +19 -47
pipeline.py CHANGED
@@ -2,59 +2,32 @@ import torch
 import transformers
 from accelerate import dispatch_model, infer_auto_device_map
 from accelerate.utils import get_balanced_memory
+from transformers import BitsAndBytesConfig, StoppingCriteria, StoppingCriteriaList
 from typing import Dict, List, Any
 
 class PreTrainedPipeline():
     def __init__(self, path=""):
         path = "oleksandrfluxon/mpt-7b-instruct-evaluate"
         print("===> path", path)
-
-        with torch.autocast('cuda'):
-            config = transformers.AutoConfig.from_pretrained(
-                path,
-                trust_remote_code=True
-            )
-            # config.attn_config['attn_impl'] = 'triton'
-            config.init_device = 'cuda:0' # For fast initialization directly on GPU!
-            config.max_seq_len = 4096 # (input + output) tokens can now be up to 4096
-
-            print("===> loading model")
-            model = transformers.AutoModelForCausalLM.from_pretrained(
-                path,
-                config=config,
-                # torch_dtype=torch.bfloat16, # Load model weights in bfloat16
-                torch_dtype=torch.float16,
-                trust_remote_code=True,
-                device_map={"": 0},
-                load_in_8bit=True # Load model in the lowest 4-bit precision quantization
-            )
-            # model.to('cuda')
-            print("===> model loaded")
-
-            # removed device_map="auto"
-            tokenizer = transformers.AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b', padding_side="left")
 
+        device = f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu'
+        print("===> device", device)
 
-            max_memory = get_balanced_memory(
-                model,
-                max_memory=None,
-                no_split_module_classes=["MPTBlock"],
-                dtype='float16',
-                low_zero=False
-            )
-
-            device_map = infer_auto_device_map(
-                model,
-                max_memory=max_memory,
-                no_split_module_classes=["MPTBlock"],
-                dtype='float16'
-            )
-            model = dispatch_model(model, device_map=device_map)
-
+        model = transformers.AutoModelForCausalLM.from_pretrained(
+            'oleksandrfluxon/mpt-7b-instruct-evaluate',
+            trust_remote_code=True,
+            load_in_8bit=True, # this requires the `bitsandbytes` library
+            max_seq_len=8192,
+            init_device=device
+        )
+        model.eval()
+        #model.to(device)
+        print(f"===> Model loaded on {device}")
 
-            # device='cuda:0'
-            self.pipeline = transformers.pipeline('text-generation', model=model, tokenizer=tokenizer)
-            print("===> init finished")
+        tokenizer = transformers.AutoTokenizer.from_pretrained("mosaicml/mpt-7b")
+
+        self.pipeline = transformers.pipeline('text-generation', model=model, tokenizer=tokenizer)
+        print("===> init finished")
 
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         """
@@ -71,8 +44,7 @@ class PreTrainedPipeline():
         print("===> inputs", inputs)
         print("===> parameters", parameters)
 
-        with torch.autocast('cuda'):
-            result = self.pipeline(inputs, **parameters)
-            print("===> result", result)
+        result = self.pipeline(inputs, **parameters)
+        print("===> result", result)
 
         return result
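The removed __init__ sharded a full fp16 copy of the model across devices with accelerate before wrapping it in a pipeline. For reference, a minimal standalone sketch of that flow, reusing the checkpoint and the MPTBlock no-split class from the removed code; it assumes a CUDA machine with enough combined GPU and CPU memory for the fp16 weights:

import torch
import transformers
from accelerate import dispatch_model, infer_auto_device_map
from accelerate.utils import get_balanced_memory

path = "oleksandrfluxon/mpt-7b-instruct-evaluate"
model = transformers.AutoModelForCausalLM.from_pretrained(
    path,
    torch_dtype=torch.float16,
    trust_remote_code=True
)

# Split the available GPU (and CPU) memory evenly so layers are balanced
# across devices instead of filling GPU 0 first; each MPTBlock stays whole.
max_memory = get_balanced_memory(
    model,
    max_memory=None,                        # None = probe the visible devices
    no_split_module_classes=["MPTBlock"],
    dtype="float16",
    low_zero=False
)

# Turn those memory limits into a module -> device assignment...
device_map = infer_auto_device_map(
    model,
    max_memory=max_memory,
    no_split_module_classes=["MPTBlock"],
    dtype="float16"
)

# ...then move the weights and register the hooks that route activations
# between devices during forward passes.
model = dispatch_model(model, device_map=device_map)

The commit drops this flow in favour of loading the weights in 8-bit directly on a single device.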
 
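The new import line pulls in BitsAndBytesConfig, but the model is still loaded with the bare load_in_8bit=True flag. If the intent is to switch to the explicit quantization-config API, a sketch might look like the following; quantization_config and device_map="auto" are assumptions, not part of the commit, while max_seq_len is forwarded to the remote MPT config exactly as the committed code does:

import transformers
from transformers import BitsAndBytesConfig

# Same request as load_in_8bit=True, expressed as an explicit config object.
quant_config = BitsAndBytesConfig(load_in_8bit=True)

model = transformers.AutoModelForCausalLM.from_pretrained(
    "oleksandrfluxon/mpt-7b-instruct-evaluate",
    trust_remote_code=True,
    quantization_config=quant_config,  # bitsandbytes 8-bit weights
    device_map="auto",                 # let accelerate place the quantized layers
    max_seq_len=8192
)
model.eval()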
 
 
 
 
 
 
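StoppingCriteria and StoppingCriteriaList are imported but not used anywhere in the new code yet. If the plan is to stop generation at MPT's end-of-text token, the published MPT examples do it roughly like this; the stop token and the way the criteria are forwarded through the pipeline call are assumptions:

import torch
import transformers
from transformers import StoppingCriteria, StoppingCriteriaList

tokenizer = transformers.AutoTokenizer.from_pretrained("mosaicml/mpt-7b")
stop_token_ids = tokenizer.convert_tokens_to_ids(["<|endoftext|>"])  # assumed stop token

class StopOnTokens(StoppingCriteria):
    # Return True as soon as the last generated token is one of the stop tokens.
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        return input_ids[0, -1].item() in stop_token_ids

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

# Inside __call__ the criteria would then be passed through to generate(), e.g.:
# result = self.pipeline(inputs, stopping_criteria=stopping_criteria, **parameters)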
 
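For a quick local check of the handler after this change: __call__ reads an inputs string and a parameters dict from the request payload. The exact keys it pops are outside this diff, so the payload shape below follows the usual custom-handler convention and is an assumption:

from pipeline import PreTrainedPipeline  # assumes this file is importable as pipeline.py

handler = PreTrainedPipeline()

payload = {
    "inputs": "Explain what MPT-7B-Instruct is in one sentence.",
    "parameters": {          # forwarded as **parameters to the text-generation pipeline
        "max_new_tokens": 64,
        "do_sample": True,
        "temperature": 0.7
    }
}

result = handler(payload)
print(result)  # e.g. [{"generated_text": "..."}]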