Spaces: Running
add flash-attn
Browse files
- app.py +4 -4
- requirements.txt +1 -0
app.py
CHANGED
@@ -29,8 +29,6 @@ import modelscope_studio as mgr
|
|
29 |
os.system("pip list|grep torch")
|
30 |
os.system("pip list|grep trans")
|
31 |
os.system("pip list|grep flash")
|
32 |
-
os.system("nvidia-smi")
|
33 |
-
os.system("ll /usr/local/cuda*")
|
34 |
|
35 |
# Argparser
|
36 |
parser = argparse.ArgumentParser(description='demo')
|
@@ -46,7 +44,8 @@ if 'int4' in model_path:
|
|
46 |
if device == 'mps':
|
47 |
print('Error: running int4 model with bitsandbytes on Mac is not supported right now.')
|
48 |
exit()
|
49 |
-
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, attn_implementation='sdpa')
|
|
|
50 |
else:
|
51 |
if args.multi_gpus:
|
52 |
from accelerate import load_checkpoint_and_dispatch, init_empty_weights, infer_auto_device_map
|
@@ -72,7 +71,8 @@ else:
|
|
72 |
|
73 |
model = load_checkpoint_and_dispatch(model, model_path, dtype=torch.bfloat16, device_map=device_map)
|
74 |
else:
|
75 |
-
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, attn_implementation='sdpa', torch_dtype=torch.bfloat16)
|
|
|
76 |
model = model.to(device=device)
|
77 |
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
78 |
model.eval()
|
|
|
29 |
os.system("pip list|grep torch")
|
30 |
os.system("pip list|grep trans")
|
31 |
os.system("pip list|grep flash")
|
|
|
|
|
32 |
|
33 |
# Argparser
|
34 |
parser = argparse.ArgumentParser(description='demo')
|
|
|
44 |
if device == 'mps':
|
45 |
print('Error: running int4 model with bitsandbytes on Mac is not supported right now.')
|
46 |
exit()
|
47 |
+
#model = AutoModel.from_pretrained(model_path, trust_remote_code=True, attn_implementation='sdpa')
|
48 |
+
model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
|
49 |
else:
|
50 |
if args.multi_gpus:
|
51 |
from accelerate import load_checkpoint_and_dispatch, init_empty_weights, infer_auto_device_map
|
|
|
71 |
|
72 |
model = load_checkpoint_and_dispatch(model, model_path, dtype=torch.bfloat16, device_map=device_map)
|
73 |
else:
|
74 |
+
#model = AutoModel.from_pretrained(model_path, trust_remote_code=True, attn_implementation='sdpa', torch_dtype=torch.bfloat16)
|
75 |
+
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16)
|
76 |
model = model.to(device=device)
|
77 |
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
78 |
model.eval()
|
requirements.txt
CHANGED
@@ -3,6 +3,7 @@ torch==2.1.2
|
|
3 |
torchvision==0.16.2
|
4 |
transformers==4.40.2
|
5 |
sentencepiece==0.1.99
|
|
|
6 |
opencv-python
|
7 |
decord
|
8 |
gradio==4.22.0
|
|
|
3 |
torchvision==0.16.2
|
4 |
transformers==4.40.2
|
5 |
sentencepiece==0.1.99
|
6 |
+
https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.2/flash_attn-2.6.2+cu123torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
|
7 |
opencv-python
|
8 |
decord
|
9 |
gradio==4.22.0
|