Commit 2688680 (992f7460f327f2bdb9f42ec7da2f545316efc3ab)
Parent(s): 99914b4

out with the old in with the new (#2)
Co-authored-by: Joseph Seed <Delta-Vector@users.noreply.huggingface.co>

Files changed:
- README.md +95 -74
- tokenizer.json +2 -2
README.md CHANGED
@@ -43,39 +43,32 @@ state of the art AI models and helping foster innovation for everyone.

### Usage

-Below we share some code snippets on how to get quickly started with running the model. First make sure to `pip install -U transformers`, then copy the snippet from the section that is relevant for your usecase.
-
-#### Running the model on a single / multi GPU

```python
-# pip install accelerate
-from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

-tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
-model = AutoModelForCausalLM.from_pretrained(
-    "google/gemma-2-9b",
-    device_map="auto",
-    torch_dtype=torch.bfloat16
)

-input_text = "Write me a poem about Machine Learning."
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
-
-outputs = model.generate(**input_ids)
-print(tokenizer.decode(outputs[0]))
```

-#### Running the model on a GPU using different precisions
-
-The native weights of this model were exported in `bfloat16` precision. You can use `float16`, which may be faster on certain hardware, by indicating the `torch_dtype` when loading the model. For convenience, the `float16` revision of the repo contains a copy of the weights already converted to that precision.
-
-You can also use `float32` if you skip the dtype, but no precision increase will occur (model weights will just be upcasted to `float32`). See examples below.
-
-* _Using `torch.float16`_

```python
# pip install accelerate
@@ -86,57 +79,31 @@ tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b",
    device_map="auto",
-    torch_dtype=torch.float16,
-    revision="float16",
)

input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

-outputs = model.generate(**input_ids)
print(tokenizer.decode(outputs[0]))
```

-* _Using `torch.bfloat16`_
-
-```python
-# pip install accelerate
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
-tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
-model = AutoModelForCausalLM.from_pretrained(
-    "google/gemma-2-9b",
-    device_map="auto",
-    torch_dtype=torch.bfloat16)
-
-input_text = "Write me a poem about Machine Learning."
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
-
-outputs = model.generate(**input_ids)
-print(tokenizer.decode(outputs[0]))
-```
-
-* _Upcasting to `torch.float32`_
-
-```python
-# pip install accelerate
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
-tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
-model = AutoModelForCausalLM.from_pretrained(
-    "google/gemma-2-9b",
-    device_map="auto")
-
-input_text = "Write me a poem about Machine Learning."
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
-
-outputs = model.generate(**input_ids)
-print(tokenizer.decode(outputs[0]))
```

#### Quantized Versions through `bitsandbytes`

-* _Using 8-bit precision (int8)_

```python
# pip install bitsandbytes accelerate
@@ -147,16 +114,21 @@ quantization_config = BitsAndBytesConfig(load_in_8bit=True)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b",
-    quantization_config=quantization_config)

input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

-outputs = model.generate(**input_ids)
print(tokenizer.decode(outputs[0]))
```

-* _Using 4-bit precision_

```python
# pip install bitsandbytes accelerate
@@ -167,30 +139,79 @@ quantization_config = BitsAndBytesConfig(load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b",
-    quantization_config=quantization_config)

input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

-outputs = model.generate(**input_ids)
print(tokenizer.decode(outputs[0]))
```

-
-
-
-```
-
-
-
-
-
```

### Inputs and outputs

* **Input:** Text string, such as a question, a prompt, or a document to be

### Usage

+Below we share some code snippets on how to get quickly started with running the model. First, install the Transformers library with:
+```sh
+pip install -U transformers
+```

+Then, copy the snippet from the section that is relevant for your usecase.

+#### Running with the `pipeline` API

```python
import torch
+from transformers import pipeline

+pipe = pipeline(
+    "text-generation",
+    model="google/gemma-2-9b",
+    device="cuda",  # replace with "mps" to run on a Mac device
)

+text = "Once upon a time,"
+outputs = pipe(text, max_new_tokens=256)
+response = outputs[0]["generated_text"]
+print(response)
```
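
The pipeline above loads the checkpoint in its default precision. The same call can presumably also take a `torch_dtype` argument to load the weights in `bfloat16`; the following is a sketch based on the Transformers `pipeline` API, not something this diff adds:

```python
# Sketch only: torch_dtype here is an assumption, not part of the README change.
import torch
from transformers import pipeline

pipe = pipeline(
    "text-generation",
    model="google/gemma-2-9b",
    device="cuda",               # replace with "mps" to run on a Mac device
    torch_dtype=torch.bfloat16,  # assumed option: load the weights in half precision
)

print(pipe("Once upon a time,", max_new_tokens=64)[0]["generated_text"])
```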

+#### Running the model on a single / multi GPU

```python
# pip install accelerate
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b",
    device_map="auto",
)

input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

+outputs = model.generate(**input_ids, max_new_tokens=32)
print(tokenizer.decode(outputs[0]))
```
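
Because the hunk only shows changed and nearby lines, the imports and tokenizer setup for this snippet are elided. Pieced together from the parallel examples elsewhere in this diff, the full snippet after the change would presumably read:

```python
# pip install accelerate
# Assembled sketch: the import and tokenizer lines are elided context in the
# diff and are assumed from the parallel examples above.
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b",
    device_map="auto",  # shard the model across the available GPU(s)
)

input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

outputs = model.generate(**input_ids, max_new_tokens=32)
print(tokenizer.decode(outputs[0]))
```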

+#### Running the model through a CLI

+The [local-gemma](https://github.com/huggingface/local-gemma) repository contains a lightweight wrapper around Transformers
+for running Gemma 2 through a command line interface, or CLI. Follow the [installation instructions](https://github.com/huggingface/local-gemma#cli-usage)
+for getting started, then launch the CLI through the following command:

+```shell
+local-gemma --model "google/gemma-2-9b" --prompt "What is the capital of Mexico?"
```

#### Quantized Versions through `bitsandbytes`

+<details>
+<summary>
+Using 8-bit precision (int8)
+</summary>

```python
# pip install bitsandbytes accelerate
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b",
+    quantization_config=quantization_config,
+)

input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

+outputs = model.generate(**input_ids, max_new_tokens=32)
print(tokenizer.decode(outputs[0]))
```
+</details>
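
For reference, the 8-bit snippet produced by this hunk, with the elided `BitsAndBytesConfig` setup (named in the `@@ -147,16 +114,21 @@` hunk header above) filled back in, would presumably read:

```python
# pip install bitsandbytes accelerate
# Assembled sketch: the import and BitsAndBytesConfig lines are elided context
# in the diff and are taken from the hunk header above.
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(load_in_8bit=True)

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b",
    quantization_config=quantization_config,
)

input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

outputs = model.generate(**input_ids, max_new_tokens=32)
print(tokenizer.decode(outputs[0]))
```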

+<details>
+<summary>
+Using 4-bit precision
+</summary>

```python
# pip install bitsandbytes accelerate
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b",
+    quantization_config=quantization_config,
+)

input_text = "Write me a poem about Machine Learning."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

+outputs = model.generate(**input_ids, max_new_tokens=32)
print(tokenizer.decode(outputs[0]))
```
+</details>
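
The 4-bit block differs only in the config (`load_in_4bit=True`, per the `@@ -167,30 +139,79 @@` hunk header). For tighter memory budgets, `BitsAndBytesConfig` also exposes NF4 and compute-dtype options; the following is a sketch of that variant and is not part of this README change:

```python
# pip install bitsandbytes accelerate
# Sketch only: the NF4 and compute-dtype options are real BitsAndBytesConfig
# parameters, but they are an addition here, not something this diff introduces.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",              # NF4 generally preserves quality better than plain 4-bit
    bnb_4bit_compute_dtype=torch.bfloat16,  # run matmuls in bfloat16
)

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-9b",
    quantization_config=quantization_config,
)

input_ids = tokenizer("Write me a poem about Machine Learning.", return_tensors="pt").to("cuda")
outputs = model.generate(**input_ids, max_new_tokens=32)
print(tokenizer.decode(outputs[0]))
```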

+#### Advanced Usage

+<details>
+<summary>
+Torch compile
+</summary>

+[Torch compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) is a method for speeding-up the
+inference of PyTorch modules. The Gemma-2 model can be run up to 6x faster by leveraging torch compile.

+Note that two warm-up steps are required before the full inference speed is realised:

+```python
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+from transformers import AutoTokenizer, Gemma2ForCausalLM
+from transformers.cache_utils import HybridCache
+import torch
+
+torch.set_float32_matmul_precision("high")
+
+# load the model + tokenizer
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
+model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b", torch_dtype=torch.bfloat16)
+model.to("cuda")
+
+# apply the torch compile transformation
+model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
+
+# pre-process inputs
+input_text = "The theory of special relativity states "
+model_inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
+prompt_length = model_inputs.input_ids.shape[1]
+
+# set-up k/v cache
+past_key_values = HybridCache(
+    config=model.config,
+    max_batch_size=1,
+    max_cache_len=model.config.max_position_embeddings,
+    device=model.device,
+    dtype=model.dtype
+)
+
+# enable passing kv cache to generate
+model._supports_cache_class = True
+model.generation_config.cache_implementation = None
+
+# two warm-up steps
+for idx in range(2):
+    outputs = model.generate(**model_inputs, past_key_values=past_key_values, do_sample=True, temperature=1.0, max_new_tokens=128)
+    past_key_values.reset()
+
+# fast run
+outputs = model.generate(**model_inputs, past_key_values=past_key_values, do_sample=True, temperature=1.0, max_new_tokens=128)
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

+For more details, refer to the [Transformers documentation](https://huggingface.co/docs/transformers/main/en/llm_optims?static-kv=basic+usage%3A+generation_config).
+
+</details>
+
### Inputs and outputs

* **Input:** Text string, such as a question, a prompt, or a document to be
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922
+size 17525357