Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -2,14 +2,14 @@ import os
 import time
 import spaces
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM, TextIteratorStreamer
 import gradio as gr
 from threading import Thread
 
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
-MODEL = "
+MODEL = "NousResearch/Hermes-3-Llama-3.2-3B"
 
-TITLE = "<h1><center>
+TITLE = "<h1><center>Hermes-3-Llama-3.2-3B</center></h1>"
 
 PLACEHOLDER = """
 <center>
@@ -35,9 +35,11 @@ device = "cuda" # for GPU usage or "cpu" for CPU usage
 tokenizer = AutoTokenizer.from_pretrained(MODEL)
 model = AutoModelForCausalLM.from_pretrained(
     MODEL,
-    torch_dtype=torch.
+    torch_dtype=torch.float16,
     device_map="auto",
-
+    load_in_8bit=False,
+    load_in_4bit=True,
+    use_flash_attention_2=True)
 
 @spaces.GPU()
 def stream_chat(
@@ -74,7 +76,7 @@ def stream_chat(
     do_sample = False,
     top_p = top_p,
     top_k = top_k,
-    eos_token_id=
+    eos_token_id = 128039,
     temperature = temperature,
     repetition_penalty=penalty,
     streamer=streamer,
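For context, recent versions of transformers deprecate passing load_in_8bit / load_in_4bit and use_flash_attention_2 directly to from_pretrained, in favor of a BitsAndBytesConfig and the attn_implementation argument. A minimal sketch of an equivalent setup under that newer API, assuming bitsandbytes and flash-attn are installed (this is not part of the commit itself):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL = "NousResearch/Hermes-3-Llama-3.2-3B"

# 4-bit quantization config, replacing the deprecated load_in_4bit kwarg
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=bnb_config,
    attn_implementation="flash_attention_2",  # replaces use_flash_attention_2=True
)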
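The third hunk hard-codes eos_token_id = 128039, presumably the id of the model's end-of-turn special token. Below is a hedged sketch of how the surrounding stream_chat generation call likely fits together, reusing the model and tokenizer from the sketch above and resolving the stop token from the tokenizer rather than hard-coding the id; the function name, prompt handling, and max_new_tokens value are assumptions, not part of the Space's code:

from threading import Thread
from transformers import TextIteratorStreamer

def stream_chat_sketch(prompt, temperature=0.8, top_p=1.0, top_k=20, penalty=1.2):
    # Hypothetical helper approximating the Space's stream_chat generation call.
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=1024,        # assumed value; this hunk is not shown in the diff
        do_sample=False,            # note: the sampling knobs below are ignored in greedy mode
        top_p=top_p,
        top_k=top_k,
        eos_token_id=tokenizer.eos_token_id,  # instead of the hard-coded 128039
        temperature=temperature,
        repetition_penalty=penalty,
        streamer=streamer,
    )
    # generate on a background thread so tokens can be yielded as they stream in
    Thread(target=model.generate, kwargs=generate_kwargs).start()
    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial

If 128039 maps to a ChatML-style end token such as <|im_end|>, tokenizer.convert_tokens_to_ids("<|im_end|>") would make that intent explicit; checking the id against the model's tokenizer config is advisable before relying on it.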