Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -9,7 +9,8 @@ from threading import Thread
|
|
9 |
|
10 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
11 |
MODEL_ID = "CohereForAI/aya-23-8B"
|
12 |
-
|
|
|
13 |
|
14 |
TITLE = "<h1><center>Aya-23-Chatbox</center></h1>"
|
15 |
|
@@ -34,26 +35,27 @@ USE_FLASH_ATTENTION = False
|
|
34 |
GRAD_ACC_STEPS = 16
|
35 |
|
36 |
quantization_config = None
|
|
|
37 |
if QUANTIZE_4BIT:
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
|
45 |
attn_implementation = None
|
46 |
if USE_FLASH_ATTENTION:
|
47 |
-
|
48 |
|
49 |
model = AutoModelForCausalLM.from_pretrained(
|
50 |
-
|
51 |
quantization_config=quantization_config,
|
52 |
attn_implementation=attn_implementation,
|
53 |
torch_dtype=torch.bfloat16,
|
54 |
device_map="auto",
|
55 |
)
|
56 |
-
tokenizer = AutoTokenizer.from_pretrained(
|
57 |
|
58 |
@spaces.GPU
|
59 |
def stream_chat(message: str, history: list, temperature: float, max_new_tokens: int):
|
|
|
9 |
|
10 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
11 |
MODEL_ID = "CohereForAI/aya-23-8B"
|
12 |
+
MODEL_ID2 = "CohereForAI/aya-23-35B"
|
13 |
+
MODEL_NAME = MODEL_ID2.split("/")[-1]
|
14 |
|
15 |
TITLE = "<h1><center>Aya-23-Chatbox</center></h1>"
|
16 |
|
|
|
35 |
GRAD_ACC_STEPS = 16
|
36 |
|
37 |
quantization_config = None
|
38 |
+
|
39 |
if QUANTIZE_4BIT:
|
40 |
+
quantization_config = BitsAndBytesConfig(
|
41 |
+
load_in_4bit=True,
|
42 |
+
bnb_4bit_quant_type="nf4",
|
43 |
+
bnb_4bit_use_double_quant=True,
|
44 |
+
bnb_4bit_compute_dtype=torch.bfloat16,
|
45 |
+
)
|
46 |
|
47 |
attn_implementation = None
|
48 |
if USE_FLASH_ATTENTION:
|
49 |
+
attn_implementation="flash_attention_2"
|
50 |
|
51 |
model = AutoModelForCausalLM.from_pretrained(
|
52 |
+
MODEL_ID2,
|
53 |
quantization_config=quantization_config,
|
54 |
attn_implementation=attn_implementation,
|
55 |
torch_dtype=torch.bfloat16,
|
56 |
device_map="auto",
|
57 |
)
|
58 |
+
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID2)
|
59 |
|
60 |
@spaces.GPU
|
61 |
def stream_chat(message: str, history: list, temperature: float, max_new_tokens: int):
|