Crystalcareai
/

GemMoE-Medium-V0.5

Text Generation

Model card Files Files and versions Community

Crystalcareai commited on Mar 27

Commit

f0fff66

•

1 Parent(s): bde282e

Update README.md

Files changed (1) hide show

README.md +14 -8

README.md CHANGED Viewed

@@ -1,36 +1,42 @@
-```import torch
 from transformers import AutoTokenizer, TextStreamer, AutoModelForCausalLM
 model_path = "Crystalcareai/GemMoE-Medium-v0.5"
 # Load model
 model = AutoModelForCausalLM.from_pretrained(
     model_path,
     device_map="auto",
     low_cpu_mem_usage=True,
     torch_dtype=torch.float16,
-    attn_implementation="flash_attention_2"
     trust_remote_code=True,
 )
 tokenizer = AutoTokenizer.from_pretrained(model_path)
 streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 # Convert prompt to tokens
-prompt_template = "[INST] {prompt} [/INST]"
 prompt = "You're standing on the surface of the Earth. "\
-        "You walk one mile south, one mile west and one mile north. "\
-        "You end up exactly where you started. Where are you?"
 tokens = tokenizer(
-    prompt_template.format(prompt=prompt),
     return_tensors='pt'
 ).input_ids.cuda()
 # Generate output
 generation_output = model.generate(
-    tokens,
     streamer=streamer,
     max_new_tokens=512
-)```

+```python
+import torch
 from transformers import AutoTokenizer, TextStreamer, AutoModelForCausalLM
 model_path = "Crystalcareai/GemMoE-Medium-v0.5"
 # Load model
 model = AutoModelForCausalLM.from_pretrained(
     model_path,
     device_map="auto",
     low_cpu_mem_usage=True,
     torch_dtype=torch.float16,
+    attn_implementation="flash_attention_2",
     trust_remote_code=True,
 )
 tokenizer = AutoTokenizer.from_pretrained(model_path)
 streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 # Convert prompt to tokens
+prompt_template = "[INST] {prompt} [/INST]"
 prompt = "You're standing on the surface of the Earth. "\
+         "You walk one mile south, one mile west and one mile north. "\
+         "You end up exactly where you started. Where are you?"
 tokens = tokenizer(
+    prompt_template.format(prompt=prompt),
     return_tensors='pt'
 ).input_ids.cuda()
 # Generate output
 generation_output = model.generate(
+    tokens,
     streamer=streamer,
     max_new_tokens=512
+)
+```