Commit 2e8c95f by Marlon Wiprud
Parent(s): a894d5f

single gpu

Files changed (1):
  1. handler.py +35 -35
handler.py CHANGED
@@ -40,48 +40,48 @@ class EndpointHandler:
 
         self.tokenizer = LlamaTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
 
-        # self.model = (
-        #     AutoModelForCausalLM.from_pretrained(
-        #         "THUDM/cogvlm-chat-hf",
-        #         torch_dtype=torch.bfloat16,
-        #         low_cpu_mem_usage=True,
-        #         trust_remote_code=True,
-        #     )
-        #     .to("cuda")
-        #     .eval()
-        # )
-
-        # DISTRIBUTED GPUS
-        with init_empty_weights():
-            self.model = AutoModelForCausalLM.from_pretrained(
+        self.model = (
+            AutoModelForCausalLM.from_pretrained(
                 "THUDM/cogvlm-chat-hf",
                 torch_dtype=torch.bfloat16,
                 low_cpu_mem_usage=True,
                 trust_remote_code=True,
             )
-
-        # print("LISTING FILES IN ", "/root/.cache/huggingface")
-        # list_files("/root/.cache/huggingface", 0, 5)
-
-        device_map = infer_auto_device_map(
-            self.model,
-            max_memory={
-                0: "12GiB",
-                1: "12GiB",
-                2: "12GiB",
-                3: "12GiB",
-                "cpu": "180GiB",
-            },
-            no_split_module_classes=["CogVLMDecoderLayer"],
+            .to("cuda")
+            .eval()
         )
 
-        self.model = load_checkpoint_and_dispatch(
-            self.model,
-            "/root/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/8abca878c4257412c4c38eeafaed3fe27a036730",
-            device_map=device_map,
-            no_split_module_classes=["CogVLMDecoderLayer"],
-        )
-        self.model = self.model.eval()
+        # DISTRIBUTED GPUS
+        # with init_empty_weights():
+        #     self.model = AutoModelForCausalLM.from_pretrained(
+        #         "THUDM/cogvlm-chat-hf",
+        #         torch_dtype=torch.bfloat16,
+        #         low_cpu_mem_usage=True,
+        #         trust_remote_code=True,
+        #     )
+
+        # # print("LISTING FILES IN ", "/root/.cache/huggingface")
+        # # list_files("/root/.cache/huggingface", 0, 5)
+
+        # device_map = infer_auto_device_map(
+        #     self.model,
+        #     max_memory={
+        #         0: "12GiB",
+        #         1: "12GiB",
+        #         2: "12GiB",
+        #         3: "12GiB",
+        #         "cpu": "180GiB",
+        #     },
+        #     no_split_module_classes=["CogVLMDecoderLayer"],
+        # )
+
+        # self.model = load_checkpoint_and_dispatch(
+        #     self.model,
+        #     "/root/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/8abca878c4257412c4c38eeafaed3fe27a036730",
+        #     device_map=device_map,
+        #     no_split_module_classes=["CogVLMDecoderLayer"],
+        # )
+        # self.model = self.model.eval()
         ## DISTRIBUTED GPUS
 
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
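
Taken together, the hunk swaps the active code path: the Accelerate-based multi-GPU dispatch is commented out and a plain single-device load takes its place, matching the "single gpu" commit message. Below is a minimal sketch of the path the commit enables, assuming the torch and transformers imports that handler.py presumably has above this hunk:

import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")

# Load CogVLM in bfloat16 and keep the whole model on one GPU.
model = (
    AutoModelForCausalLM.from_pretrained(
        "THUDM/cogvlm-chat-hf",
        torch_dtype=torch.bfloat16,  # halves memory vs. float32
        low_cpu_mem_usage=True,      # avoid a full extra CPU copy while loading
        trust_remote_code=True,      # CogVLM ships custom modeling code
    )
    .to("cuda")                      # one device, so no device_map is needed
    .eval()                          # inference mode: disables dropout etc.
)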
 
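For reference, the branch the commit disables used Accelerate's big-model utilities to shard the checkpoint across devices. Here is a sketch of that path, reconstructed from the removed lines; the four-GPU memory budget and the local snapshot path are taken verbatim from the diff and are specific to the original deployment:

import torch
from accelerate import (
    infer_auto_device_map,
    init_empty_weights,
    load_checkpoint_and_dispatch,
)
from transformers import AutoModelForCausalLM

# Build the module tree on the meta device, without allocating real weights.
with init_empty_weights():
    model = AutoModelForCausalLM.from_pretrained(
        "THUDM/cogvlm-chat-hf",
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )

# Plan a placement: up to 12 GiB per GPU, overflow into CPU RAM. Each
# CogVLMDecoderLayer is kept whole on a single device so a layer's tensors
# never straddle a device boundary.
device_map = infer_auto_device_map(
    model,
    max_memory={0: "12GiB", 1: "12GiB", 2: "12GiB", 3: "12GiB", "cpu": "180GiB"},
    no_split_module_classes=["CogVLMDecoderLayer"],
)

# Stream the checkpoint shards from the cached snapshot onto the planned
# devices (the snapshot hash below is deployment-specific).
model = load_checkpoint_and_dispatch(
    model,
    "/root/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/8abca878c4257412c4c38eeafaed3fe27a036730",
    device_map=device_map,
    no_split_module_classes=["CogVLMDecoderLayer"],
).eval()

With a device map in play, Accelerate attaches hooks that move activations between devices at the planned split points, which is why the dispatch path needs no explicit .to("cuda") on the model.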