Added example to run on smaller GPUs
#1
by
buzzcraft
- opened
README.md
CHANGED
@@ -296,6 +296,53 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
|
|
296 |
tokenizer = AutoTokenizer.from_pretrained("norallm/normistral-7b-warm")
|
297 |
model = AutoModelForCausalLM.from_pretrained("norallm/normistral-7b-warm").cuda().eval()
|
298 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
299 |
# Now we will define the zero-shot prompt template
|
300 |
prompt = """Engelsk: {0}
|
301 |
Bokmål:"""
|
|
|
296 |
tokenizer = AutoTokenizer.from_pretrained("norallm/normistral-7b-warm")
|
297 |
model = AutoModelForCausalLM.from_pretrained("norallm/normistral-7b-warm").cuda().eval()
|
298 |
|
299 |
+
# Now we will define the zero-shot prompt template
|
300 |
+
prompt = """Engelsk: {0}
|
301 |
+
Bokmål:"""
|
302 |
+
|
303 |
+
# A function that will take care of generating the output
|
304 |
+
@torch.no_grad()
|
305 |
+
def generate(text):
|
306 |
+
text = prompt.format(text)
|
307 |
+
input_ids = tokenizer(text, return_tensors='pt').input_ids.cuda()
|
308 |
+
prediction = model.generate(
|
309 |
+
input_ids,
|
310 |
+
max_new_tokens=64,
|
311 |
+
do_sample=False,
|
312 |
+
eos_token_id=tokenizer('\n').input_ids
|
313 |
+
)
|
314 |
+
return tokenizer.decode(prediction[0, input_ids.size(1):]).strip()
|
315 |
+
|
316 |
+
# Now you can simply call the generate function with an English text you want to translate:
|
317 |
+
generate("I'm super excited about this Norwegian NORA model! Can it translate these sentences?")
|
318 |
+
# > this should output: 'Jeg er super spent på denne norske NORA modellen! Kan den oversette disse setningene?'
|
319 |
+
```
|
320 |
+
|
321 |
+
_____
|
322 |
+
## Example usage with reduced GPU memory
|
323 |
+
Install `bitsandbytes` and `accelerate` if you want to load the model in 8-bit
|
324 |
+
|
325 |
+
```bash
|
326 |
+
pip install bitsandbytes
|
327 |
+
pip install accelerate
|
328 |
+
```
|
329 |
+
|
330 |
+
|
331 |
+
```python
|
332 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
333 |
+
import torch
|
334 |
+
|
335 |
+
# First, we will have to import the tokenizer and the language model
|
336 |
+
tokenizer = AutoTokenizer.from_pretrained("norallm/normistral-7b-warm")
|
337 |
+
model = AutoModelForCausalLM.from_pretrained("norallm/normistral-7b-warm",
|
338 |
+
device_map='auto',
|
339 |
+
load_in_8bit=True,
|
340 |
+
torch_dtype=torch.float16)
|
341 |
+
# This setup needs about 8 GB of VRAM
|
342 |
+
# Setting load_in_8bit=False requires about 15 GB of VRAM
|
343 |
+
# Using torch.float32 and load_in_8bit=False requires about 21 GB of VRAM
|
344 |
+
|
345 |
+
|
346 |
# Now we will define the zero-shot prompt template
|
347 |
prompt = """Engelsk: {0}
|
348 |
Bokmål:"""
|