
#6
by sdyy - opened

Colab T4

#@title Install HQQ
!pip install -qqqU git+https://github.com/mobiusml/hqq.git
!pip install -qqqU accelerate einops

If you want to compare against the original Llama 2 from Meta's Hugging Face repository, you will need your Hugging Face access token, since the meta-llama repository is gated. If you do not want to make the comparison, you can skip this step.

from huggingface_hub import login
login()

#@title Load both the 1-bit and full-precision models. It runs on a T4 instance.
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM
import torch

Set the model IDs for the quantized and the full-precision model

quantized_model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf_1bitgs8_hqq'
full_model_id = 'meta-llama/Llama-2-7b-chat-hf'

Specify the PyTorch data type, the target device, and whether to also load the full-precision model

torch_dtype = torch.float16
device = 'cuda'
load_full_model = False

#################################################################################

Load the quantized model using HQQModelForCausalLM

one_bit_model = HQQModelForCausalLM.from_quantized(quantized_model_id, adapter='adapter_v0.1.lora')
one_bit_model.config.use_cache = True
one_bit_model.eval()
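
As a quick sanity check that the 1-bit model actually fits on the T4's 16 GB, you can print the GPU memory PyTorch has allocated right after loading (not part of the original notebook; the numbers only reflect what PyTorch itself has allocated on the current CUDA device):

# Optional sanity check: report GPU memory usage after loading the 1-bit model
print(f"Allocated VRAM: {torch.cuda.memory_allocated() / 1024**3:.2f} GiB")
print(f"Reserved VRAM:  {torch.cuda.memory_reserved() / 1024**3:.2f} GiB")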

#################################################################################

Load the full model using AutoModelForCausalLM with automatic device mapping

if load_full_model:
    fullbit_model = AutoModelForCausalLM.from_pretrained(
        full_model_id,
        device_map=device,
        trust_remote_code=True,
        torch_dtype=torch_dtype
    )

    fullbit_model.config.use_cache = True
    fullbit_model.eval()
else:
    fullbit_model = None
#################################################################################

Load tokenizer from the quantized model

tokenizer = AutoTokenizer.from_pretrained(quantized_model_id)

Configure the tokenizer for inference

tokenizer.add_bos_token = False
tokenizer.add_eos_token = False
if not tokenizer.pad_token:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

Define a function for streaming inference with chat interaction

import torch, transformers
from threading import Thread

def chat_processor(chat, current_model, current_tokenizer, max_new_tokens=100, do_sample=True, device=device):
    current_tokenizer.use_default_system_prompt = False
    # Stream tokens back as they are generated instead of waiting for the full answer
    streamer = transformers.TextIteratorStreamer(current_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    # Tokenize the Llama-2 chat prompt and move it to the target device
    generate_params = dict(
        current_tokenizer("<s> [INST] " + chat + " [/INST] ", return_tensors="pt").to(device),
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        pad_token_id=current_tokenizer.pad_token_id,
        top_p=0.90 if do_sample else None,
        top_k=50 if do_sample else None,
        temperature=0.6 if do_sample else None,
        num_beams=1,
        repetition_penalty=1.2,
    )

    # Run generation in a background thread so the streamer can be consumed here
    t = Thread(target=current_model.generate, kwargs=generate_params)
    t.start()

    print("User: ", chat)
    print("Assistant: ")
    outputs = ""
    for text in streamer:
        outputs += text
        print(text, end="", flush=True)

    torch.cuda.empty_cache()

    return outputs

import random, gc

List of questions for the quiz

questions = [
    "What happens to you if you eat watermelon seeds?",
    "Can I give grapes to a dog?",
    "What is the solution to x^2 - 1 = 0",
    "Are rats good at cooking?",
    "What movie does this emoji describe? 👦👓⚡️\n choice: harry potter\n. choice: shutter island\n. choice: inglourious basterds\n. choice: die hard\n. choice: moonlight\nA:",
    "In what follows, we provide a brief narrative along with a choice of proverbs. Please select the one that best fits the narrative.\n\nNarrative: David really wanted a better job than his current one. He couldn't manage his bills with his existing salary. Despite his extensive search, it seemed futile. However, he wouldn’t settle for less. He believed that the right job was out there. After three days of relentless searching, David finally landed a great new job.\n\nChoices:\n1. A golden key can open any door.\n2. Look before you leap.\n3. Strike while the iron is hot.\n4. Seek and you shall find.\n5. The best things in life are free.\n\nA:",
    "Which of the following sentences makes more sense? example: choice: The girl had a fever because mom gave the girl fever medicine. choice: Mom gave the girl fever medicine because the girl had a fever.",
    "Is the given example sentence an example of irony? Respond 'ironic' or 'not ironic' and give your reason. Example: Honesty is the best policy, but insanity is the best defense. Ironic/Not ironic?",
    "If you had 3 apples and you ate 2 yesterday, how many apples do you have now?",
]

Select a random question from the list

curr_question = random.choice(questions)

print("-------------------------")
print("Output as per 1-bit model")
print("-------------------------")
one_bit_outputs = chat_processor(curr_question,
                                 one_bit_model,
                                 tokenizer,
                                 max_new_tokens=256,
                                 do_sample=False)
print("\n\n")

if fullbit_model is not None:
    print("----------------------------------")
    print("Output as per full precision model")
    print("----------------------------------")
    fullbit_outputs = chat_processor(curr_question,
                                     fullbit_model,
                                     tokenizer,
                                     max_new_tokens=256,
                                     do_sample=False)

torch.cuda.empty_cache()
gc.collect()


Output as per 1-bit model

User: What happens to you if you eat watermelon seeds?
Assistant:
Exception in thread Thread-12 (generate):
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 2215, in generate
    result = self._sample(
  File "/usr/local/lib/python3.10/dist-packages/transformers/generation/utils.py", line 3206, in _sample
    outputs = self(**model_inputs, return_dict=True)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 1190, in forward
    outputs = self.model(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 921, in forward
    position_embeddings = self.rotary_emb(hidden_states, position_ids)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 158, in forward
    freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)

Empty                                     Traceback (most recent call last)
in <cell line: 22>()
     20 print("Output as per 1-bit model")
     21 print("-------------------------")
---> 22 one_bit_outputs = chat_processor(curr_question,
     23                                  one_bit_model,
     24                                  tokenizer,

2 frames
/usr/lib/python3.10/queue.py in get(self, block, timeout)
    177                 remaining = endtime - time()
    178                 if remaining <= 0.0:
--> 179                     raise Empty
    180                 self.not_empty.wait(remaining)
    181             item = self._get()

Empty:
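
The queue.Empty traceback in the main thread is only a side effect: generate died in its background thread with the RuntimeError above, so the TextIteratorStreamer's 10-second timeout expires with nothing to read. The real problem is the device mismatch inside the rotary embedding, which suggests a buffer (likely inv_freq) stayed on the CPU while the inputs and quantized weights are on cuda:0. A minimal diagnostic sketch, assuming one_bit_model is the patched transformers model returned by from_quantized, to list whatever was left on the CPU:

# Diagnostic sketch (assumption: the 1-bit weights sit on cuda:0 and a leftover
# buffer such as the rotary embedding's inv_freq is still on the CPU).
cpu_params = [n for n, p in one_bit_model.named_parameters() if p.device.type != 'cuda']
cpu_buffers = [n for n, b in one_bit_model.named_buffers() if b.device.type != 'cuda']
print("parameters not on CUDA:", cpu_params)
print("buffers not on CUDA:", cpu_buffers)

# If only something like '...rotary_emb.inv_freq' shows up, moving that module to
# the GPU before calling chat_processor is one thing to try; whether .to('cuda')
# is safe on the HQQ-patched layers is an assumption, not something verified here.
# one_bit_model.model.rotary_emb.to('cuda')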
