On a Colab T4, the Gemma-2 GPTQ model runs with:
from transformers import pipeline
messages = [
{"role": "user", "content": "Who are you?"},
]
pipe = pipeline("text-generation", model="ModelCloud/gemma-2-9b-it-gptq-4bit")
pipe(messages)
but it does not run with:
import os
# Gemma-2 uses the FlashInfer backend for models with logits_soft_cap; otherwise the output might be wrong.
os.environ['VLLM_ATTENTION_BACKEND'] = 'FLASHINFER'
from transformers import AutoTokenizer
from gptqmodel import BACKEND, GPTQModel
model_name = "ModelCloud/gemma-2-27b-it-gptq-4bit"
prompt = [{"role": "user", "content": "I am in Shanghai, preparing to visit the natural history museum. Can you tell me the best way to"}]
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = GPTQModel.from_quantized(
    model_name,
    backend=BACKEND.VLLM,
)
inputs = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
outputs = model.generate(prompts=inputs, temperature=0.95, max_length=128)
print(outputs[0].outputs[0].text)
and likewise with the 9B model:
import os
# Gemma-2 uses the FlashInfer backend for models with logits_soft_cap; otherwise the output might be wrong.
os.environ['VLLM_ATTENTION_BACKEND'] = 'FLASHINFER'
from transformers import AutoTokenizer
from gptqmodel import BACKEND, GPTQModel
model_name = "ModelCloud/gemma-2-9b-it-gptq-4bit"
prompt = [{"role": "user", "content": "I am in Shanghai, preparing to visit the natural history museum. Can you tell me the best way to"}]
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = GPTQModel.from_quantized(
    model_name,
    backend=BACKEND.VLLM,
)
inputs = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
outputs = model.generate(prompts=inputs, temperature=0.95, max_length=128)
print(outputs[0].outputs[0].text)
The 9B attempt fails with:
/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: The secret `HF_TOKEN` does not exist in your Colab secrets. To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session. You will be able to reuse this secret in all of your notebooks. Please note that authentication is recommended but still optional to access public models or datasets.
  warnings.warn(
/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:797: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
INFO - Ignoring unknown parameter in the quantization configuration: model_name_or_path.
INFO - Ignoring unknown parameter in the quantization configuration: model_file_base_name.
WARNING 12-10 23:57:32 config.py:200] Gemma 2 uses sliding window attention for every odd layer, which is currently not supported by vLLM. Disabling sliding window and capping the max length to the sliding window size (4096).
WARNING 12-10 23:57:32 config.py:319] gptq quantization is not fully optimized yet. The speed can be slower than non-quantized models.
INFO 12-10 23:57:32 llm_engine.py:234] Initializing an LLM engine (v0.6.3.dev28+g33f460b1.d20240927) with config: model='ModelCloud/gemma-2-9b-it-gptq-4bit', speculative_config=None, tokenizer='ModelCloud/gemma-2-9b-it-gptq-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=gptq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=ModelCloud/gemma-2-9b-it-gptq-4bit, use_v2_block_manager=False, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stream_outputs=False, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=False, mm_processor_kwargs=None)
INFO 12-10 23:57:35 selector.py:142] Using Flashinfer backend.
ValueError Traceback (most recent call last)
in <cell line: 14>()
12 tokenizer = AutoTokenizer.from_pretrained(model_name)
13
---> 14 model = GPTQModel.from_quantized(
15 model_name,
16 backend=BACKEND.VLLM,
9 frames
/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py in _check_if_gpu_supports_dtype(torch_dtype)
466 compute_str = f"has compute capability {version_str}"
467
--> 468 raise ValueError(
469 "Bfloat16 is only supported on GPUs with compute capability "
470 f"of at least 8.0. Your {gpu_name} GPU {compute_str}. "
ValueError: Bfloat16 is only supported on GPUs with compute capability of at least 8.0. Your Tesla T4 GPU has compute capability 7.5. You can use float16 instead by explicitly setting the `dtype` flag in CLI, for example: --dtype=half.
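The traceback already names the workaround: use float16 on the T4 instead of bfloat16. Below is a minimal sketch of that route, calling vllm.LLM directly so the dtype can be passed explicitly; it bypasses the GPTQModel wrapper used above, and whether GPTQ plus the FlashInfer backend actually runs on a T4 is not verified here.

# Sketch only: retry the failing vLLM path with float16, as the ValueError suggests.
# Assumes FlashInfer is installed; GPTQ + FlashInfer on a T4 is an unverified assumption.
import os
os.environ['VLLM_ATTENTION_BACKEND'] = 'FLASHINFER'

from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

model_name = "ModelCloud/gemma-2-9b-it-gptq-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_name)

llm = LLM(
    model=model_name,
    quantization="gptq",
    dtype="half",        # float16 instead of bfloat16, per the error above
    max_model_len=4096,  # matches the sliding-window cap vLLM already applies
)

prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Who are you?"}],
    tokenize=False,
    add_generation_prompt=True,
)
outputs = llm.generate([prompt], SamplingParams(temperature=0.95, max_tokens=128))
print(outputs[0].outputs[0].text)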
The working pipeline example above produces the following output:
/usr/local/lib/python3.10/dist-packages/auto_gptq/nn_modules/triton_utils/kernels.py:411: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
  def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
/usr/local/lib/python3.10/dist-packages/auto_gptq/nn_modules/triton_utils/kernels.py:419: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
  def backward(ctx, grad_output):
/usr/local/lib/python3.10/dist-packages/auto_gptq/nn_modules/triton_utils/kernels.py:461: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
  @custom_fwd(cast_inputs=torch.float16)
WARNING:auto_gptq.nn_modules.qlinear.qlinear_cuda:CUDA extension not installed.
WARNING:auto_gptq.nn_modules.qlinear.qlinear_cuda_old:CUDA extension not installed.
`low_cpu_mem_usage` was None, now default to True since model is quantized.
/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:5055: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead
  warnings.warn(
`loss_type=None` was set in the config but it is unrecognised. Using the default loss: `ForCausalLMLoss`.
WARNING:optimum.gptq.quantizer:Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. Setting disable_exllama=True
Device set to use cuda:0
The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.
[{'generated_text': [{'role': 'user', 'content': 'Who are you?'},
{'role': 'assistant',
'content': 'I am Gemma, an open-weights AI assistant. I am a large language model trained by Google'}]}]
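The optimum warning above ("Found modules on cpu/disk") suggests part of the model was offloaded and ExLlama was disabled. A hedged variant that requests fp16 weights and automatic placement explicitly (standard pipeline arguments, nothing specific to this checkpoint) may keep everything on the T4 if the 4-bit weights fit:

# Sketch: same pipeline call, but with explicit dtype/placement so the quantized
# weights are less likely to be offloaded to CPU/disk.
import torch
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="ModelCloud/gemma-2-9b-it-gptq-4bit",
    torch_dtype=torch.float16,  # T4 (compute capability 7.5) has no bfloat16 support
    device_map="auto",
)
print(pipe([{"role": "user", "content": "Who are you?"}], max_new_tokens=64))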
It also runs with:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

# Define the GPTQ configuration
gptq_config = GPTQConfig(bits=2, use_exllama=True, use_cuda_fp16=True)

# Load the quantized model
model = AutoModelForCausalLM.from_pretrained(
    "ModelCloud/gemma-2-9b-gptq-4bit",
    device_map="auto",
    quantization_config=gptq_config
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("ModelCloud/gemma-2-9b-gptq-4bit")

# Generate and print the output
print(tokenizer.decode(model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(model.device))[0]))
/usr/local/lib/python3.10/dist-packages/transformers/quantizers/auto.py:186: UserWarning: You passed `quantization_config` or equivalent parameters to `from_pretrained` but the model you're loading already has a `quantization_config` attribute. The `quantization_config` from the model will be used. However, loading attributes (e.g. ['use_cuda_fp16', 'use_exllama', 'max_input_length', 'exllama_config', 'disable_exllama']) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.
  warnings.warn(warning_msg)
/usr/local/lib/python3.10/dist-packages/auto_gptq/nn_modules/triton_utils/kernels.py:411: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
  def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
/usr/local/lib/python3.10/dist-packages/auto_gptq/nn_modules/triton_utils/kernels.py:419: FutureWarning: `torch.cuda.amp.custom_bwd(args...)` is deprecated. Please use `torch.amp.custom_bwd(args..., device_type='cuda')` instead.
  def backward(ctx, grad_output):
/usr/local/lib/python3.10/dist-packages/auto_gptq/nn_modules/triton_utils/kernels.py:461: FutureWarning: `torch.cuda.amp.custom_fwd(args...)` is deprecated. Please use `torch.amp.custom_fwd(args..., device_type='cuda')` instead.
  @custom_fwd(cast_inputs=torch.float16)
WARNING:auto_gptq.nn_modules.qlinear.qlinear_cuda:CUDA extension not installed.
WARNING:auto_gptq.nn_modules.qlinear.qlinear_cuda_old:CUDA extension not installed.
/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:5055: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead
  warnings.warn(
`loss_type=None` was set in the config but it is unrecognised. Using the default loss: `ForCausalLMLoss`.
The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.
gptqmodel is a Python library. gptqmodel has no vulnerabilities, it has a Permissive License and it
It likewise runs with:
import torch
from transformers import AutoModelForCausalLM, GPTQConfig, AutoTokenizer
gptq_config = GPTQConfig(bits=2, use_exllama=True, use_cuda_fp16=True)
model = AutoModelForCausalLM.from_pretrained("ModelCloud/gemma-2-9b-gptq-4bit", device_map="auto", quantization_config=gptq_config)
tokenizer = AutoTokenizer.from_pretrained("ModelCloud/gemma-2-9b-gptq-4bit")
print(tokenizer.decode(model.generate(**tokenizer("ai is", return_tensors="pt").to(model.device))[0]))
The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.
ai is a very good tool for the future. It can help us to do many things. For example,
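Since the transformers warning above says the checkpoint's own quantization_config takes precedence over the GPTQConfig passed to from_pretrained, the explicit config is largely redundant. A minimal sketch that simply relies on the stored config (an alternative, not what was run above) would be:

# Sketch: rely on the quantization_config stored in the checkpoint instead of
# passing a (partially ignored) GPTQConfig at load time.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "ModelCloud/gemma-2-9b-gptq-4bit"
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("gptqmodel is", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))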
FlashInfer was installed with:
!pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3
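Because the failure above traces back to the T4's compute capability (7.5 < 8.0), a small dtype guard before any of the loads can make the notebook portable across GPUs; this is plain PyTorch, independent of GPTQModel or vLLM:

# Sketch: choose float16 on pre-Ampere GPUs (compute capability < 8.0), bfloat16 otherwise.
import torch

major, minor = torch.cuda.get_device_capability()
dtype = torch.bfloat16 if (major, minor) >= (8, 0) else torch.float16
print(f"Compute capability {major}.{minor} -> using {dtype}")  # a T4 reports 7.5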