megrisdal and reach-vb (HF staff) committed
Commit 6c1058d
1 Parent(s): 1937c70

Update README.md (#35)


- Update README.md (aab9b2b21dda7698508d9f571593044845cb62e5)
- Update README.md (bd8be0b392d5d5df896719a73568103069925868)


Co-authored-by: Vaibhav Srivastav <reach-vb@users.noreply.huggingface.co>

Files changed (1): README.md (+126 −21)
README.md CHANGED
@@ -45,11 +45,37 @@ state of the art AI models and helping foster innovation for everyone.

### Usage

- Below we share some code snippets on how to get quickly started with running the model. First make sure to `pip install -U transformers`, then copy the snippet from the section that is relevant for your usecase.
+ Below we share some code snippets on how to get quickly started with running the model. First, install the Transformers library with:
+ ```sh
+ pip install -U transformers
+ ```
+
+ Then, copy the snippet from the section that is relevant for your usecase.

- #### Running the model on a single / multi GPU
+ #### Running with the `pipeline` API
+
+ ```python
+ import torch
+ from transformers import pipeline
+
+ pipe = pipeline(
+     "text-generation",
+     model="google/gemma-2-9b-it",
+     model_kwargs={"torch_dtype": torch.bfloat16},
+     device="cuda",  # replace with "mps" to run on a Mac device
+ )
+
+ messages = [
+     {"role": "user", "content": "Who are you? Please, answer in pirate-speak."},
+ ]
+
+ outputs = pipe(messages, max_new_tokens=256)
+ assistant_response = outputs[0]["generated_text"][-1]["content"].strip()
+ print(assistant_response)
+ # Ahoy, matey! I be Gemma, a digital scallywag, a language-slingin' parrot of the digital seas. I be here to help ye with yer wordy woes, answer yer questions, and spin ye yarns of the digital world. So, what be yer pleasure, eh? 🦜
+ ```
+
+ #### Running the model on a single / multi GPU

```python
# pip install accelerate
@@ -60,13 +86,24 @@ tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b-it",
    device_map="auto",
-     torch_dtype=torch.bfloat16
+     torch_dtype=torch.bfloat16,
)

input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

- outputs = model.generate(**input_ids)
+ outputs = model.generate(**input_ids, max_new_tokens=32)
+ print(tokenizer.decode(outputs[0]))
+ ```
+
+ You can ensure the correct chat template is applied by using `tokenizer.apply_chat_template` as follows:
+ ```python
+ messages = [
+     {"role": "user", "content": "Write me a poem about Machine Learning."},
+ ]
+ input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to("cuda")
+
+ outputs = model.generate(**input_ids, max_new_tokens=256)
print(tokenizer.decode(outputs[0]))
```
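
As an aside to the hunk above: the `apply_chat_template` call it adds does not pass `add_generation_prompt`, so the rendered prompt may not end with the assistant-turn prefix that instruction-tuned models expect before generating. A minimal sketch of the same call with the generation prompt requested explicitly, reusing the `tokenizer` and `model` from the snippet (an illustration, not a line from this commit):

```python
# Illustrative variant of the README snippet above (not part of the commit).
# Assumes `tokenizer` and `model` are already loaded as shown in the diff.
messages = [
    {"role": "user", "content": "Write me a poem about Machine Learning."},
]
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,  # append the assistant-turn prefix defined by the template
    return_tensors="pt",
    return_dict=True,
).to("cuda")

outputs = model.generate(**input_ids, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```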
 
@@ -86,18 +123,32 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b-it",
-     device_map="auto")
+     device_map="auto",
+ )

input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

- outputs = model.generate(**input_ids)
+ outputs = model.generate(**input_ids, max_new_tokens=32)
print(tokenizer.decode(outputs[0]))
```

+ #### Running the model through a CLI
+
+ The [local-gemma](https://github.com/huggingface/local-gemma) repository contains a lightweight wrapper around Transformers
+ for running Gemma 2 through a command line interface, or CLI. Follow the [installation instructions](https://github.com/huggingface/local-gemma#cli-usage)
+ for getting started, then launch the CLI through the following command:
+
+ ```shell
+ local-gemma --model 9b --preset speed
+ ```
+
#### Quantized Versions through `bitsandbytes`

- * _Using 8-bit precision (int8)_
+ <details>
+ <summary>
+ Using 8-bit precision (int8)
+ </summary>

```python
# pip install bitsandbytes accelerate
@@ -108,16 +159,21 @@ quantization_config = BitsAndBytesConfig(load_in_8bit=True)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b-it",
-     quantization_config=quantization_config)
+     quantization_config=quantization_config,
+ )

input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

- outputs = model.generate(**input_ids)
+ outputs = model.generate(**input_ids, max_new_tokens=32)
print(tokenizer.decode(outputs[0]))
```
+ </details>

- * _Using 4-bit precision_
+ <details>
+ <summary>
+ Using 4-bit precision
+ </summary>

```python
# pip install bitsandbytes accelerate
@@ -128,30 +184,79 @@ quantization_config = BitsAndBytesConfig(load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b-it",
-     quantization_config=quantization_config)
+     quantization_config=quantization_config,
+ )

input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

- outputs = model.generate(**input_ids)
+ outputs = model.generate(**input_ids, max_new_tokens=32)
print(tokenizer.decode(outputs[0]))
```
+ </details>

- #### Other optimizations
+ #### Advanced Usage

- * _Flash Attention 2_
+ <details>
+ <summary>
+ Torch compile
+ </summary>

- First make sure to install `flash-attn` in your environment `pip install flash-attn`
+ [Torch compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) is a method for speeding-up the
+ inference of PyTorch modules. The Gemma-2 model can be run up to 6x faster by leveraging torch compile.

- ```diff
- model = AutoModelForCausalLM.from_pretrained(
-     model_id,
-     torch_dtype=torch.float16,
- + attn_implementation="flash_attention_2"
- ).to(0)
+ Note that two warm-up steps are required before the full inference speed is realised:
+
+ ```python
+ import os
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ from transformers import AutoTokenizer, Gemma2ForCausalLM
+ from transformers.cache_utils import HybridCache
+ import torch
+
+ torch.set_float32_matmul_precision("high")
+
+ # load the model + tokenizer
+ tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
+ model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b-it", torch_dtype=torch.bfloat16)
+ model.to("cuda")
+
+ # apply the torch compile transformation
+ model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
+
+ # pre-process inputs
+ input_text = "The theory of special relativity states "
+ model_inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
+ prompt_length = model_inputs.input_ids.shape[1]
+
+ # set-up k/v cache
+ past_key_values = HybridCache(
+     config=model.config,
+     max_batch_size=1,
+     max_cache_len=model.config.max_position_embeddings,
+     device=model.device,
+     dtype=model.dtype
+ )
+
+ # enable passing kv cache to generate
+ model._supports_cache_class = True
+ model.generation_config.cache_implementation = None
+
+ # two warm-up steps
+ for idx in range(2):
+     outputs = model.generate(**model_inputs, past_key_values=past_key_values, do_sample=True, temperature=1.0, max_new_tokens=128)
+     past_key_values.reset()
+
+ # fast run
+ outputs = model.generate(**model_inputs, past_key_values=past_key_values, do_sample=True, temperature=1.0, max_new_tokens=128)
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

+ For more details, refer to the [Transformers documentation](https://huggingface.co/docs/transformers/main/en/llm_optims?static-kv=basic+usage%3A+generation_config).
+
+ </details>
+
### Chat Template

The instruction-tuned models use a chat template that must be adhered to for conversational use.
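
To make the template concrete, here is a small illustrative sketch (not taken from this commit) that renders a conversation with the tokenizer's bundled chat template and prints the resulting prompt string; Gemma 2's template delimits turns with `<start_of_turn>` and `<end_of_turn>` markers.

```python
# Sketch: inspect the prompt produced by the tokenizer's chat template.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")

messages = [
    {"role": "user", "content": "Write me a poem about Machine Learning."},
]

# tokenize=False returns the formatted string instead of token ids, which is
# handy for checking how the conversation is laid out before generating.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)
```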
 
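As a closing illustration that ties the quantization and chat-template pieces of the diff together, the sketch below loads the model in 4-bit and generates from a templated prompt. The `bnb_4bit_quant_type` and `bnb_4bit_compute_dtype` settings are illustrative extras, not options this commit adds to the README.

```python
# Illustrative end-to-end sketch combining the bitsandbytes and chat-template
# snippets above; the bnb_4bit_* options go beyond what the README shows.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",              # 4-bit NormalFloat quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # run compute in bfloat16
)

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b-it",
    quantization_config=quantization_config,
    device_map="auto",
)

messages = [
    {"role": "user", "content": "Write me a poem about Machine Learning."},
]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt", return_dict=True
).to(model.device)

outputs = model.generate(**input_ids, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```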