Fixed call of Flash Attention implementation
#9 by Dana-Farber - opened
README.md CHANGED
@@ -137,7 +137,7 @@ model = AutoModelForCausalLM.from_pretrained(
   "stabilityai/stable-code-3b",
   trust_remote_code=True,
   torch_dtype="auto",
-
+  attn_implementation="flash_attention_2",
 )
 model.cuda()
 inputs = tokenizer("<fim_prefix>def fib(n):<fim_suffix> else:\n return fib(n - 2) + fib(n - 1)<fim_middle>", return_tensors="pt").to(model.device)
@@ -164,7 +164,7 @@ model = AutoModelForCausalLM.from_pretrained(
   "stabilityai/stable-code-3b",
   trust_remote_code=True,
   torch_dtype="auto",
-
+  attn_implementation="flash_attention_2",
 )
 model.cuda()
 inputs = tokenizer("import torch\nimport torch.nn as nn", return_tensors="pt").to(model.device)
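For context, here is a minimal end-to-end sketch of the corrected README snippet as it would run after this change, with the `attn_implementation="flash_attention_2"` argument the PR fixes. The surrounding tokenizer and generation calls follow the README's completion example; the generation parameters shown are illustrative, and running it requires a CUDA GPU with the flash-attn package installed.

```python
# Sketch of the corrected README usage (assumes CUDA + flash-attn installed).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "stabilityai/stable-code-3b", trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
    "stabilityai/stable-code-3b",
    trust_remote_code=True,
    torch_dtype="auto",
    attn_implementation="flash_attention_2",  # the argument this PR fixes
)
model.cuda()

# Plain code-completion prompt from the README's second example.
inputs = tokenizer(
    "import torch\nimport torch.nn as nn", return_tensors="pt"
).to(model.device)
tokens = model.generate(
    **inputs,
    max_new_tokens=48,   # illustrative generation settings
    temperature=0.2,
    do_sample=True,
)
print(tokenizer.decode(tokens[0], skip_special_tokens=True))
```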