Anthonyg5005 committed on
Commit
f3cac53
·
1 Parent(s): 8effb79

change up setups

Browse files

Didn't test on Linux, but it should work.

auto-exl2-upload/auto-exl2-upload.zip CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:64bc897ec0699349f1f1d6e6a9cd9f2e4c8e94d4de6e45453603afe8f93f6803
3
- size 8403
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d112e7bf1d8f4f6f42c961edb46f89f8356ec5265798b493f3d6b55e2c994376
3
+ size 8585
auto-exl2-upload/linux-setup.sh CHANGED
@@ -40,7 +40,7 @@ fi
40
  read -p "Please enter your GPU compute version, CUDA 11/12 or AMD ROCm (11, 12, rocm): " pytorch_version
41
 
42
  # ask to install flash attention
43
- echo "Flash attention is a feature that could fix overflow issues on some more broken models."
44
  read -p "Would you like to install flash-attention? (rarely needed and optional) (y/n) " flash_attention
45
  if [ "$flash_attention" != "y" ] && [ "$flash_attention" != "n" ]; then
46
  echo "Invalid input. Please enter y or n."
@@ -69,7 +69,6 @@ rm download-model.py
69
  rm -rf exllamav2
70
  rm start-quant.sh
71
  rm enter-venv.sh
72
- rm -rf flash-attention
73
 
74
  # download stuff
75
  echo "Downloading files"
@@ -87,13 +86,7 @@ venv/bin/python -m pip install -r exllamav2/requirements.txt
87
  venv/bin/python -m pip install huggingface-hub transformers accelerate
88
  venv/bin/python -m pip install ./exllamav2
89
 
90
- if [ "$flash_attention" = "y" ]; then
91
- echo "Installing flash-attention..."
92
- echo "If failed, retry without flash-attention."
93
- git clone https://github.com/Dao-AILab/flash-attention
94
- venv/bin/python -m pip install ./flash-attention
95
- rm -rf flash-attention
96
- fi
97
 
98
  # create start-quant.sh
99
  echo "#!/bin/bash" > start-quant.sh
@@ -107,6 +100,15 @@ echo "#!/bin/bash" > enter-venv.sh
107
  echo "bash --init-file venv/bin/activate" >> enter-venv.sh
108
  chmod +x enter-venv.sh
109
 
 
 
 
 
 
 
 
 
 
110
  echo "If you use ctrl+c to stop, you may need to also use 'pkill python' to stop running scripts."
111
  echo "Environment setup complete. run start-quant.sh to start the quantization process."
112
  read -p "Press enter to exit"
 
40
  read -p "Please enter your GPU compute version, CUDA 11/12 or AMD ROCm (11, 12, rocm): " pytorch_version
41
 
42
  # ask to install flash attention
43
+ echo "Flash attention is a feature that could fix overflow issues on some more broken models, however, it will increase install time by a few hours."
44
  read -p "Would you like to install flash-attention? (rarely needed and optional) (y/n) " flash_attention
45
  if [ "$flash_attention" != "y" ] && [ "$flash_attention" != "n" ]; then
46
  echo "Invalid input. Please enter y or n."
 
69
  rm -rf exllamav2
70
  rm start-quant.sh
71
  rm enter-venv.sh
 
72
 
73
  # download stuff
74
  echo "Downloading files"
 
86
  venv/bin/python -m pip install huggingface-hub transformers accelerate
87
  venv/bin/python -m pip install ./exllamav2
88
 
89
+ echo "Writing shell files..."
 
 
 
 
 
 
90
 
91
  # create start-quant.sh
92
  echo "#!/bin/bash" > start-quant.sh
 
100
  echo "bash --init-file venv/bin/activate" >> enter-venv.sh
101
  chmod +x enter-venv.sh
102
 
103
+ if [ "$flash_attention" = "y" ]; then
104
+ echo "Going to attempt to install flash attention but it isn't required."
105
+ echo "You may close now if you'd like and continue without flash attention."
106
+ read -p "Press enter to continue and install flash attention"
107
+ echo "Get some popcorn and watch a movie, this will take a while."
108
+ echo "Installing flash-attn..."
109
+ venv/bin/python -m pip install git+https://github.com/Dao-AILab/flash-attention.git
110
+ fi
111
+
112
  echo "If you use ctrl+c to stop, you may need to also use 'pkill python' to stop running scripts."
113
  echo "Environment setup complete. run start-quant.sh to start the quantization process."
114
  read -p "Press enter to exit"
auto-exl2-upload/windows-setup.bat CHANGED
@@ -43,7 +43,7 @@ where nvcc
43
  set /p cuda_version="Please enter your CUDA version (11 or 12): "
44
 
45
  REM ask to install flash attention
46
- echo Flash attention is a feature that could fix overflow issues on some more broken models. However it will increase install time by a few hours.
47
  set /p flash_attention="Would you like to install flash-attention? (rarely needed and optional) (y/n) "
48
  if not "%flash_attention%"=="y" if not "%flash_attention%"=="n" (
49
  echo Invalid input. Please enter y or n.
@@ -69,7 +69,6 @@ del download-model.py
69
  rmdir /s /q exllamav2
70
  del start-quant.bat
71
  del enter-venv.bat
72
- rmdir /s /q flash-attention
73
 
74
  REM download stuff
75
  echo Downloading files...
@@ -87,13 +86,7 @@ venv\scripts\python.exe -m pip install -r exllamav2/requirements.txt
87
  venv\scripts\python.exe -m pip install huggingface-hub transformers accelerate
88
  venv\scripts\python.exe -m pip install .\exllamav2
89
 
90
- if "%flash_attention%"=="y" (
91
- echo Installing flash-attention. Go watch some movies, this will take a while...
92
- echo If failed, retry without flash-attention.
93
- git clone https://github.com/Dao-AILab/flash-attention
94
- venv\scripts\python.exe -m pip install .\flash-attention
95
- rmdir /s /q flash-attention
96
- )
97
 
98
  REM create start-quant-windows.bat
99
  echo @echo off > start-quant.bat
@@ -106,6 +99,15 @@ REM create enter-venv.bat
106
  echo @echo off > enter-venv.bat
107
  echo cmd /k call venv\scripts\activate.bat >> enter-venv.bat
108
 
 
 
 
 
 
 
 
 
 
109
  powershell -c (New-Object Media.SoundPlayer "C:\Windows\Media\tada.wav").PlaySync();
110
  echo Environment setup complete. run start-quant.bat to start the quantization process.
111
  pause
 
43
  set /p cuda_version="Please enter your CUDA version (11 or 12): "
44
 
45
  REM ask to install flash attention
46
+ echo Flash attention is a feature that could fix overflow issues on some more broken models, however, it will increase install time by a few hours.
47
  set /p flash_attention="Would you like to install flash-attention? (rarely needed and optional) (y/n) "
48
  if not "%flash_attention%"=="y" if not "%flash_attention%"=="n" (
49
  echo Invalid input. Please enter y or n.
 
69
  rmdir /s /q exllamav2
70
  del start-quant.bat
71
  del enter-venv.bat
 
72
 
73
  REM download stuff
74
  echo Downloading files...
 
86
  venv\scripts\python.exe -m pip install huggingface-hub transformers accelerate
87
  venv\scripts\python.exe -m pip install .\exllamav2
88
 
89
+ echo Writing batch files...
 
 
 
 
 
 
90
 
91
  REM create start-quant-windows.bat
92
  echo @echo off > start-quant.bat
 
99
  echo @echo off > enter-venv.bat
100
  echo cmd /k call venv\scripts\activate.bat >> enter-venv.bat
101
 
102
+ if "%flash_attention%"=="y" (
103
+ echo Going to attempt to install flash attention but it isn't required.
104
+ echo You may close now if you'd like and continue without flash attention.
105
+ pause
106
+ echo Get some popcorn and watch a movie. This will take a while.
107
+ echo Installing flash-attn...
108
+ venv\scripts\python.exe -m pip install git+https://github.com/Dao-AILab/flash-attention.git
109
+ )
110
+
111
  powershell -c (New-Object Media.SoundPlayer "C:\Windows\Media\tada.wav").PlaySync();
112
  echo Environment setup complete. run start-quant.bat to start the quantization process.
113
  pause
exl2-multi-quant-local/exl2-multi-quant-local.zip CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5bd8f0bfff817ece26c0fe1a0886c2851f761386b61fe5d53e69b080341a634a
3
- size 7226
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96d89522925670652ab7ea1d6152a4e64c15302a940c9753a37345f2e9a06e58
3
+ size 7408
exl2-multi-quant-local/linux-setup.sh CHANGED
@@ -40,7 +40,7 @@ fi
40
  read -p "Please enter your GPU compute version, CUDA 11/12 or AMD ROCm (11, 12, rocm): " pytorch_version
41
 
42
  # ask to install flash attention
43
- echo "Flash attention is a feature that could fix overflow issues on some more broken models."
44
  read -p "Would you like to install flash-attention? (rarely needed and optional) (y/n) " flash_attention
45
  if [ "$flash_attention" != "y" ] && [ "$flash_attention" != "n" ]; then
46
  echo "Invalid input. Please enter y or n."
@@ -69,7 +69,6 @@ rm download-model.py
69
  rm -rf exllamav2
70
  rm start-quant.sh
71
  rm enter-venv.sh
72
- rm -rf flash-attention
73
 
74
  # download stuff
75
  echo "Downloading files"
@@ -87,13 +86,7 @@ venv/bin/python -m pip install -r exllamav2/requirements.txt
87
  venv/bin/python -m pip install huggingface-hub transformers accelerate
88
  venv/bin/python -m pip install ./exllamav2
89
 
90
- if [ "$flash_attention" = "y" ]; then
91
- echo "Installing flash-attention..."
92
- echo "If failed, retry without flash-attention."
93
- git clone https://github.com/Dao-AILab/flash-attention
94
- venv/bin/python -m pip install ./flash-attention
95
- rm -rf flash-attention
96
- fi
97
 
98
  # create start-quant.sh
99
  echo "#!/bin/bash" > start-quant.sh
@@ -107,6 +100,15 @@ echo "#!/bin/bash" > enter-venv.sh
107
  echo "bash --init-file venv/bin/activate" >> enter-venv.sh
108
  chmod +x enter-venv.sh
109
 
 
 
 
 
 
 
 
 
 
110
  echo "If you use ctrl+c to stop, you may need to also use 'pkill python' to stop running scripts."
111
  echo "Environment setup complete. run start-quant.sh to start the quantization process."
112
  read -p "Press enter to exit"
 
40
  read -p "Please enter your GPU compute version, CUDA 11/12 or AMD ROCm (11, 12, rocm): " pytorch_version
41
 
42
  # ask to install flash attention
43
+ echo "Flash attention is a feature that could fix overflow issues on some more broken models, however, it will increase install time by a few hours."
44
  read -p "Would you like to install flash-attention? (rarely needed and optional) (y/n) " flash_attention
45
  if [ "$flash_attention" != "y" ] && [ "$flash_attention" != "n" ]; then
46
  echo "Invalid input. Please enter y or n."
 
69
  rm -rf exllamav2
70
  rm start-quant.sh
71
  rm enter-venv.sh
 
72
 
73
  # download stuff
74
  echo "Downloading files"
 
86
  venv/bin/python -m pip install huggingface-hub transformers accelerate
87
  venv/bin/python -m pip install ./exllamav2
88
 
89
+ echo "Writing shell files..."
 
 
 
 
 
 
90
 
91
  # create start-quant.sh
92
  echo "#!/bin/bash" > start-quant.sh
 
100
  echo "bash --init-file venv/bin/activate" >> enter-venv.sh
101
  chmod +x enter-venv.sh
102
 
103
+ if [ "$flash_attention" = "y" ]; then
104
+ echo "Going to attempt to install flash attention but it isn't required."
105
+ echo "You may close now if you'd like and continue without flash attention."
106
+ read -p "Press enter to continue and install flash attention"
107
+ echo "Get some popcorn and watch a movie, this will take a while."
108
+ echo "Installing flash-attn..."
109
+ venv/bin/python -m pip install git+https://github.com/Dao-AILab/flash-attention.git
110
+ fi
111
+
112
  echo "If you use ctrl+c to stop, you may need to also use 'pkill python' to stop running scripts."
113
  echo "Environment setup complete. run start-quant.sh to start the quantization process."
114
  read -p "Press enter to exit"
exl2-multi-quant-local/windows-setup.bat CHANGED
@@ -43,7 +43,7 @@ where nvcc
43
  set /p cuda_version="Please enter your CUDA version (11 or 12): "
44
 
45
  REM ask to install flash attention
46
- echo Flash attention is a feature that could fix overflow issues on some more broken models. However it will increase install time by a few hours.
47
  set /p flash_attention="Would you like to install flash-attention? (rarely needed and optional) (y/n) "
48
  if not "%flash_attention%"=="y" if not "%flash_attention%"=="n" (
49
  echo Invalid input. Please enter y or n.
@@ -69,7 +69,6 @@ del download-model.py
69
  rmdir /s /q exllamav2
70
  del start-quant.bat
71
  del enter-venv.bat
72
- rmdir /s /q flash-attention
73
 
74
  REM download stuff
75
  echo Downloading files...
@@ -87,13 +86,7 @@ venv\scripts\python.exe -m pip install -r exllamav2/requirements.txt
87
  venv\scripts\python.exe -m pip install huggingface-hub transformers accelerate
88
  venv\scripts\python.exe -m pip install .\exllamav2
89
 
90
- if "%flash_attention%"=="y" (
91
- echo Installing flash-attention. Go watch some movies, this will take a while...
92
- echo If failed, retry without flash-attention.
93
- git clone https://github.com/Dao-AILab/flash-attention
94
- venv\scripts\python.exe -m pip install .\flash-attention
95
- rmdir /s /q flash-attention
96
- )
97
 
98
  REM create start-quant-windows.bat
99
  echo @echo off > start-quant.bat
@@ -106,6 +99,15 @@ REM create enter-venv.bat
106
  echo @echo off > enter-venv.bat
107
  echo cmd /k call venv\scripts\activate.bat >> enter-venv.bat
108
 
 
 
 
 
 
 
 
 
 
109
  powershell -c (New-Object Media.SoundPlayer "C:\Windows\Media\tada.wav").PlaySync();
110
  echo Environment setup complete. run start-quant.bat to start the quantization process.
111
  pause
 
43
  set /p cuda_version="Please enter your CUDA version (11 or 12): "
44
 
45
  REM ask to install flash attention
46
+ echo Flash attention is a feature that could fix overflow issues on some more broken models, however, it will increase install time by a few hours.
47
  set /p flash_attention="Would you like to install flash-attention? (rarely needed and optional) (y/n) "
48
  if not "%flash_attention%"=="y" if not "%flash_attention%"=="n" (
49
  echo Invalid input. Please enter y or n.
 
69
  rmdir /s /q exllamav2
70
  del start-quant.bat
71
  del enter-venv.bat
 
72
 
73
  REM download stuff
74
  echo Downloading files...
 
86
  venv\scripts\python.exe -m pip install huggingface-hub transformers accelerate
87
  venv\scripts\python.exe -m pip install .\exllamav2
88
 
89
+ echo Writing batch files...
 
 
 
 
 
 
90
 
91
  REM create start-quant-windows.bat
92
  echo @echo off > start-quant.bat
 
99
  echo @echo off > enter-venv.bat
100
  echo cmd /k call venv\scripts\activate.bat >> enter-venv.bat
101
 
102
+ if "%flash_attention%"=="y" (
103
+ echo Going to attempt to install flash attention but it isn't required.
104
+ echo You may close now if you'd like and continue without flash attention.
105
+ pause
106
+ echo Get some popcorn and watch a movie. This will take a while.
107
+ echo Installing flash-attn...
108
+ venv\scripts\python.exe -m pip install git+https://github.com/Dao-AILab/flash-attention.git
109
+ )
110
+
111
  powershell -c (New-Object Media.SoundPlayer "C:\Windows\Media\tada.wav").PlaySync();
112
  echo Environment setup complete. run start-quant.bat to start the quantization process.
113
  pause