README.md · Crystalcareai/GemMoE-Medium-V0.5 at bde282e6446d74d5cf394c7e3b3be588ca1fdb8d

from transformers import AutoTokenizer, TextStreamer, AutoModelForCausalLM

model_path = "Crystalcareai/GemMoE-Medium-v0.5"

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    attn_implementation="flash_attention_2"
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_path)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Convert prompt to tokens
prompt_template = "[INST] {prompt} [/INST]"

prompt = "You're standing on the surface of the Earth. "\
        "You walk one mile south, one mile west and one mile north. "\
        "You end up exactly where you started. Where are you?"

tokens = tokenizer(
    prompt_template.format(prompt=prompt), 
    return_tensors='pt'
).input_ids.cuda()

# Generate output
generation_output = model.generate(
    tokens, 
    streamer=streamer,
    max_new_tokens=512
)```