Docker start with vllm failed. Official vllm docker image 0.7.3
sudo docker run --ipc=host --log-opt max-size=10m --log-opt max-file=1 --rm -it --gpus '"device=0"' -p 9000:8000 --mount type=bind,source=/home/me/.cache,target=/root/.cache vllm/vllm-openai:v0.7.3 --model cognitivecomputations/DeepSeek-V3-AWQ --tensor-parallel-size 1 --gpu-memory-utilization 0.90 --max-model-len 6000 --dtype half --disable-log-requests --trust-remote-code --cpu-offload-gb 450
Latest vllm 0.7.3.
Failed:
Loading safetensors checkpoint shards: 100% Completed | 36/36 [06:44<00:00, 10.25s/it]
Loading safetensors checkpoint shards: 100% Completed | 36/36 [06:44<00:00, 11.24s/it]
ERROR 02-20 23:42:47 engine.py:400] A is not on GPU
ERROR 02-20 23:42:47 engine.py:400] Traceback (most recent call last):
ERROR 02-20 23:42:47 engine.py:400]   File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 391, in run_mp_engine
ERROR 02-20 23:42:47 engine.py:400]     engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
ERROR 02-20 23:42:47 engine.py:400]              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 02-20 23:42:47 engine.py:400]   File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 124, in from_engine_args
ERROR 02-20 23:42:47 engine.py:400]     return cls(ipc_path=ipc_path,
ERROR 02-20 23:42:47 engine.py:400]            ^^^^^^^^^^^^^^^^^^^^^^
ERROR 02-20 23:42:47 engine.py:400]   File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 76, in __init__
ERROR 02-20 23:42:47 engine.py:400]     self.engine = LLMEngine(*args, **kwargs)
ERROR 02-20 23:42:47 engine.py:400]                   ^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 02-20 23:42:47 engine.py:400]   File "/usr/local/lib/python3.12/dist-packages/vllm/engine/llm_engine.py", line 273, in __init__
ERROR 02-20 23:42:47 engine.py:400]     self.model_executor = executor_class(vllm_config=vllm_config, )
ERROR 02-20 23:42:47 engine.py:400]                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 02-20 23:42:47 engine.py:400]   File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 52, in __init__
ERROR 02-20 23:42:47 engine.py:400]     self._init_executor()
ERROR 02-20 23:42:47 engine.py:400]   File "/usr/local/lib/python3.12/dist-packages/vllm/executor/uniproc_executor.py", line 47, in _init_executor
ERROR 02-20 23:42:47 engine.py:400]     self.collective_rpc("load_model")
ERROR 02-20 23:42:47 engine.py:400]   File "/usr/local/lib/python3.12/dist-packages/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
ERROR 02-20 23:42:47 engine.py:400]     answer = run_method(self.driver_worker, method, args, kwargs)
ERROR 02-20 23:42:47 engine.py:400]              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 02-20 23:42:47 engine.py:400]   File "/usr/local/lib/python3.12/dist-packages/vllm/utils.py", line 2196, in run_method
ERROR 02-20 23:42:47 engine.py:400]     return func(*args, **kwargs)
ERROR 02-20 23:42:47 engine.py:400]            ^^^^^^^^^^^^^^^^^^^^^
ERROR 02-20 23:42:47 engine.py:400]   File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker.py", line 183, in load_model
ERROR 02-20 23:42:47 engine.py:400]     self.model_runner.load_model()
ERROR 02-20 23:42:47 engine.py:400]   File "/usr/local/lib/python3.12/dist-packages/vllm/worker/model_runner.py", line 1112, in load_model
ERROR 02-20 23:42:47 engine.py:400]     self.model = get_model(vllm_config=self.vllm_config)
ERROR 02-20 23:42:47 engine.py:400]                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 02-20 23:42:47 engine.py:400]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/__init__.py", line 14, in get_model
ERROR 02-20 23:42:47 engine.py:400]     return loader.load_model(vllm_config=vllm_config)
ERROR 02-20 23:42:47 engine.py:400]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 02-20 23:42:47 engine.py:400]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/loader.py", line 420, in load_model
ERROR 02-20 23:42:47 engine.py:400]     _process_weights_after_loading(model, model_config, target_device)
ERROR 02-20 23:42:47 engine.py:400]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/loader.py", line 177, in _process_weights_after_loading
ERROR 02-20 23:42:47 engine.py:400]     module.process_weights_after_loading(model_config.dtype)
ERROR 02-20 23:42:47 engine.py:400]   File "/usr/local/lib/python3.12/dist-packages/vllm/attention/layer.py", line 222, in process_weights_after_loading
ERROR 02-20 23:42:47 engine.py:400]     self.impl.process_weights_after_loading(act_dtype)
ERROR 02-20 23:42:47 engine.py:400]   File "/usr/local/lib/python3.12/dist-packages/vllm/attention/backends/mla/utils.py", line 297, in process_weights_after_loading
ERROR 02-20 23:42:47 engine.py:400]     kv_b_proj_weight = get_and_maybe_dequant_weights(self.kv_b_proj).T
ERROR 02-20 23:42:47 engine.py:400]                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 02-20 23:42:47 engine.py:400]   File "/usr/local/lib/python3.12/dist-packages/vllm/attention/backends/mla/utils.py", line 285, in get_and_maybe_dequant_weights
ERROR 02-20 23:42:47 engine.py:400]     dequant_weights = layer.quant_method.apply(layer,
ERROR 02-20 23:42:47 engine.py:400]                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 02-20 23:42:47 engine.py:400]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/awq_marlin.py", line 303, in apply
ERROR 02-20 23:42:47 engine.py:400]     return apply_awq_marlin_linear(
ERROR 02-20 23:42:47 engine.py:400]            ^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 02-20 23:42:47 engine.py:400]   File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/marlin_utils.py", line 348, in apply_awq_marlin_linear
ERROR 02-20 23:42:47 engine.py:400]     output = ops.gptq_marlin_gemm(reshaped_x,
ERROR 02-20 23:42:47 engine.py:400]              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 02-20 23:42:47 engine.py:400]   File "/usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py", line 690, in gptq_marlin_gemm
ERROR 02-20 23:42:47 engine.py:400]     return torch.ops._C.gptq_marlin_gemm(a, b_q_weight, b_scales, b_zeros,
ERROR 02-20 23:42:47 engine.py:400]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 02-20 23:42:47 engine.py:400]   File "/usr/local/lib/python3.12/dist-packages/torch/_ops.py", line 1116, in __call__
ERROR 02-20 23:42:47 engine.py:400]     return self._op(*args, **(kwargs or {}))
ERROR 02-20 23:42:47 engine.py:400]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 02-20 23:42:47 engine.py:400] RuntimeError: A is not on GPU
Error with "-q awq" quantization:
ERROR 02-21 00:48:48 engine.py:400]
Traceback (most recent call last):
File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 391, in run_mp_engine
engine = MQLLMEngine.from_engine_args(engine_args=engine_args,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 124, in from_engine_args
return cls(ipc_path=ipc_path,
^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/engine/multiprocessing/engine.py", line 76, in __init__
self.engine = LLMEngine(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/engine/llm_engine.py", line 273, in __init__
self.model_executor = executor_class(vllm_config=vllm_config, )
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 52, in __init__
self._init_executor()
File "/usr/local/lib/python3.12/dist-packages/vllm/executor/uniproc_executor.py", line 47, in _init_executor
self.collective_rpc("load_model")
File "/usr/local/lib/python3.12/dist-packages/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
answer = run_method(self.driver_worker, method, args, kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/utils.py", line 2196, in run_method
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker.py", line 183, in load_model
self.model_runner.load_model()
File "/usr/local/lib/python3.12/dist-packages/vllm/worker/model_runner.py", line 1112, in load_model
self.model = get_model(vllm_config=self.vllm_config)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/__init__.py", line 14, in get_model
return loader.load_model(vllm_config=vllm_config)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/loader.py", line 406, in load_model
model = _initialize_model(vllm_config=vllm_config)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/loader.py", line 125, in _initialize_model
return model_class(vllm_config=vllm_config, prefix=prefix)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v2.py", line 655, in __init__
self.model = DeepseekV2Model(vllm_config=vllm_config,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 151, in __init__
old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v2.py", line 589, in __init__
self.start_layer, self.end_layer, self.layers = make_layers(
^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/utils.py", line 558, in make_layers
maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v2.py", line 591, in &lt;lambda&gt;
lambda prefix: DeepseekV2DecoderLayer(
^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v2.py", line 517, in __init__
self.mlp = DeepseekV2MoE(
^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v2.py", line 129, in __init__
self.experts = FusedMoE(
^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py", line 294, in __init__
assert self.quant_method is not None
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError
You must NOT use CPU offload with this model; run fully on GPU. Both failures trace back to `--cpu-offload-gb`: the AWQ Marlin kernel requires its input tensors to be resident on the GPU, so when offloaded weights are passed to `gptq_marlin_gemm` it raises `RuntimeError: A is not on GPU`; and with `-q awq`, `maybe_offload_to_cpu` leaves the `FusedMoE` layers without a `quant_method`, triggering the `AssertionError` in `fused_moe/layer.py`. Remove the `--cpu-offload-gb` flag and size the model/quantization to fit entirely in GPU memory.