liuhaotian commited on
Commit
7564980
1 Parent(s): c5721f2

Add flash attention

Browse files
Files changed (2) hide show
  1. app.py +7 -6
  2. requirements.txt +2 -2
app.py CHANGED
@@ -40,6 +40,7 @@ def start_worker(model_path: str, bits=16):
40
  model_path,
41
  "--model-name",
42
  model_name,
 
43
  ]
44
  if bits != 16:
45
  worker_command += [f"--load-{bits}bit"]
@@ -65,12 +66,12 @@ if __name__ == "__main__":
65
  ONLY WORKS WITH GPU! By default, we load the model with 4-bit quantization to make it fit on smaller hardware. Set the environment variable `bits` to control the quantization.
66
 
67
  Set the environment variable `model` to change the model, and switch hardware accordingly:
68
- | Model | Hardware |
69
- |-------|-------------------|
70
- | liuhaotian/llava-v1.6-mistral-7b | T4-medium |
71
- | liuhaotian/llava-v1.6-vicuna-7b | T4-medium |
72
- | liuhaotian/llava-v1.6-vicuna-13b | T4-medium |
73
- | liuhaotian/llava-v1.6-34b | 2xA10G large |
74
  """
75
 
76
  print(f"args: {gws.args}")
 
40
  model_path,
41
  "--model-name",
42
  model_name,
43
+ "--use-flash-attn",
44
  ]
45
  if bits != 16:
46
  worker_command += [f"--load-{bits}bit"]
 
66
  ONLY WORKS WITH GPU! By default, we load the model with 4-bit quantization to make it fit on smaller hardware. Set the environment variable `bits` to control the quantization.
67
 
68
  Set the environment variable `model` to change the model, and switch hardware accordingly:
69
+ | Model | Hardware |
70
+ |-----------------------------------|------------|
71
+ | liuhaotian/llava-v1.6-mistral-7b | T4 small |
72
+ | liuhaotian/llava-v1.6-vicuna-7b | T4 small |
73
+ | liuhaotian/llava-v1.6-vicuna-13b | T4 small |
74
+ | liuhaotian/llava-v1.6-34b | A10G large |
75
  """
76
 
77
  print(f"args: {gws.args}")
requirements.txt CHANGED
@@ -1,2 +1,2 @@
1
- llava-torch==1.2.1.post4
2
- protobuf==4.23.3
 
1
+ llava-torch
2
+ flash-attn