Use ZeroGPU
Files changed:
- README.md: +1 -1
- algorithms.py: +3 -3
- app.py: +19 -1
- load.py: +9 -2
- requirements.txt: +2 -1
README.md CHANGED

@@ -4,7 +4,7 @@ emoji: 🔗
 colorFrom: gray
 colorTo: gray
 sdk: gradio
-python_version: 3.
+python_version: 3.10
 sdk_version: 4.41.0
 app_file: app.py
 short_description: Tightly pair LLM responses
algorithms.py CHANGED

@@ -467,12 +467,12 @@ def _generate_streaming_impl(device, model, tokenizer, prompt_BL, max_tokens):
 
 # APOC unconditional streaming
 @torch.no_grad()
-def apoc_streaming(device, model_x, model_y, tokenizer, chat_x, chat_y, max_tokens=512, seed=None):
+def apoc_streaming(model_x, model_y, tokenizer, chat_x, chat_y, max_tokens=512, seed=None):
     if seed is not None:
         torch.manual_seed(seed)
 
-    prompt_x_BL = tokenize_prompt(device, tokenizer, chat_x, quiet=True)
-    prompt_y_BL = tokenize_prompt(device, tokenizer, chat_y, quiet=True)
+    prompt_x_BL = tokenize_prompt(model_x.device, tokenizer, chat_x, quiet=True)
+    prompt_y_BL = tokenize_prompt(model_y.device, tokenizer, chat_y, quiet=True)
     model_pair = ModelPair(model_x, model_y, prompt_x_BL, prompt_y_BL)
 
     logger.debug('PROMPT X:')
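The explicit device argument goes away because each model now carries its own placement. A minimal sketch of that pattern, using a hypothetical placeholder checkpoint rather than the Space's actual model:

    # Sketch only: inputs follow the model via `model.device`, so callers
    # no longer need to thread a `device` argument through.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    repo_id = 'gpt2'  # hypothetical stand-in for the Space's checkpoint
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    model = AutoModelForCausalLM.from_pretrained(repo_id, device_map='auto')

    prompt_BL = tokenizer('Hello', return_tensors='pt').input_ids.to(model.device)
    with torch.no_grad():
        out = model.generate(prompt_BL, max_new_tokens=8)
    print(tokenizer.decode(out[0]))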
app.py CHANGED

@@ -1,5 +1,6 @@
 # Gradio demo of streaming generation of multiple LLM response pairs.
 
+import spaces
 import logging
 import time
 import html
@@ -7,6 +8,20 @@ import numpy as np
 import gradio as gr
 import util
 
+import huggingface_hub
+import torch
+import transformers
+import accelerate
+
+# For setting `requirements.txt`.
+print('Dependency versions:')
+print(f'huggingface_hub=={huggingface_hub.__version__}')
+print(f'numpy=={np.__version__}')
+print(f'torch=={torch.__version__}')
+print(f'transformers=={transformers.__version__}')
+print(f'accelerate=={accelerate.__version__}')
+print()
+
 # gr.DataFrame is currently bugged for updating values,
 # so we must use raw HTML.
 # https://github.com/gradio-app/gradio/issues/8160
@@ -95,7 +110,11 @@ else:
         ]
         return chat
 
+    @spaces.GPU
     def fn(max_tokens, num_responses, prompt_x, prompt_y):
+        # Is this necessary with ZeroGPU?
+        torch.use_deterministic_algorithms(True)
+
         rows = [['']*2 for i in range(num_responses)]
         yield make_html_table(HEADERS, rows)
 
@@ -106,7 +125,6 @@ else:
         chat_y = make_chat(system_msg, prompt_y)
 
         gen = algorithms.apoc_streaming(
-            'cpu',
             model,
             model,
             tokenizer,
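For reference, the usual ZeroGPU pattern that `@spaces.GPU` enables looks roughly like the toy Space below; the function and variable names here are illustrative, not the Space's own:

    # Toy ZeroGPU app, for illustration only. `@spaces.GPU` marks the function
    # that needs CUDA; a GPU is attached just for the duration of each call.
    import spaces
    import torch
    import gradio as gr

    @spaces.GPU  # an optional duration can be given, e.g. @spaces.GPU(duration=120)
    def run(prompt):
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        return f'Running on {device}: {prompt}'

    demo = gr.Interface(fn=run, inputs='text', outputs='text')

    if __name__ == '__main__':
        demo.launch()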
load.py CHANGED

@@ -6,9 +6,11 @@ import torch
 import transformers
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
-def load_model(repo_id, device_map=None, bnb=None, torch_dtype='auto'):
+def load_model(repo_id, bnb=None, torch_dtype='auto'):
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
     # Try our best to get deterministic results.
-    if
+    if device.type == 'cuda':
        # For determinism with CUDA >= 10.2, PyTorch says to use one of these.
        os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
        #os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16:8'
@@ -35,6 +37,11 @@ def load_model(repo_id, device_map=None, bnb=None, torch_dtype='auto'):
     if bnb == 'nf4':
         bnb_config = BitsAndBytesConfig(load_in_4bit=True)
 
+    device_map = 'auto'
+    if device.type == 'cpu':
+        # BFloat16 is not supported on MPS
+        device_map = None
+
     model = AutoModelForCausalLM.from_pretrained(
         repo_id,
         torch_dtype=torch_dtype,
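The new load path boils down to roughly the following sketch (the 'gpt2' placeholder and the helper name are assumptions, not the Space's actual values): detect the device once, then pick `device_map` accordingly so a CPU-only run skips accelerate's auto placement.

    # Sketch of the device-selection idea, not the Space's exact load.py.
    import torch
    from transformers import AutoModelForCausalLM

    def load(repo_id='gpt2'):
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

        # `device_map='auto'` relies on accelerate; on a CPU-only machine,
        # fall back to a plain load and keep the model where it lands.
        device_map = 'auto' if device.type == 'cuda' else None

        model = AutoModelForCausalLM.from_pretrained(
            repo_id,
            torch_dtype='auto',
            device_map=device_map,
        )
        return model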
requirements.txt CHANGED

@@ -1,4 +1,5 @@
 huggingface_hub==0.22.2
 numpy==1.26.4
 torch==2.2.2
-transformers==4.40.2
+transformers==4.40.2
+accelerate==0.31.0