Run it on Colab.

#4 by girrajjangid - opened
!pip -q install transformers==4.34.0 
!pip -q install accelerate==0.23.0
!pip -q install flash-attn==2.3.3 --no-build-isolation
from transformers import AutoModelForCausalLM, AutoTokenizer
import transformers
import torch

model_id = "amazon/MistralLite"

# Load the tokenizer and model; device_map="auto" places the weights on the available GPU/CPU
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             torch_dtype=torch.bfloat16,
                                             offload_folder="offload",
                                             device_map="auto")
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer)

prompt = "<|prompter|>What are the main challenges to support a long context for LLM? Explain in detail in 1000-2000 words.</s><|assistant|>"

# Greedy decoding (do_sample=False); return_full_text=False strips the prompt from the output
sequences = pipeline(
    prompt,
    max_new_tokens=5000,
    do_sample=False,
    return_full_text=False,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

for seq in sequences:
    print(f"{seq['generated_text']}")

flash_attn v2 is not supported on the T4 GPU.
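
For context, FlashAttention-2 needs a GPU with CUDA compute capability 8.0 or higher (Ampere or newer), while the T4 is Turing at 7.5. A quick runtime check, assuming a CUDA build of PyTorch:

import torch

# FlashAttention-2 requires compute capability >= 8.0 (Ampere or newer);
# the T4 is Turing (7.5), so this prints False there.
if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    print(f"Compute capability: {major}.{minor}")
    print("FlashAttention-2 supported:", (major, minor) >= (8, 0))
else:
    print("No CUDA device available.")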

Amazon Web Services org

Yes, in this case, we can run the model without flash_attn v2. Thank you!
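
A minimal sketch of making the flag conditional, assuming transformers 4.34 as pinned above (where from_pretrained accepts use_flash_attention_2); on unsupported GPUs such as the T4 the kwarg is simply left out:

import torch
from transformers import AutoModelForCausalLM

model_id = "amazon/MistralLite"

# Request FlashAttention-2 only on GPUs that support it (compute capability >= 8.0);
# otherwise omit the kwarg and the default attention implementation is used.
load_kwargs = dict(torch_dtype=torch.bfloat16, device_map="auto")
if torch.cuda.is_available() and torch.cuda.get_device_capability() >= (8, 0):
    load_kwargs["use_flash_attention_2"] = True  # opt-in flag in transformers 4.34

model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)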

Also, the T4 doesn't support bfloat16.

Amazon Web Services org

You can try float16 instead; it should work as well. Cheers!
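
For example, a small sketch that picks the dtype at load time; torch.cuda.is_bf16_supported() returns False on a T4, so this falls back to float16:

import torch
from transformers import AutoModelForCausalLM

model_id = "amazon/MistralLite"

# bfloat16 needs Ampere or newer; fall back to float16 on older GPUs like the T4.
dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=dtype,
    device_map="auto",
)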

yinsong1986 changed discussion status to closed
