masharpe committed
Commit 44b21e0 · 1 Parent(s): 36113cf

Use ZeroGPU

Files changed (5)
  1. README.md +1 -1
  2. algorithms.py +3 -3
  3. app.py +19 -1
  4. load.py +9 -2
  5. requirements.txt +2 -1
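
Context for the change: on ZeroGPU hardware a Space is only granted a GPU while a function decorated with `spaces.GPU` is running, so device handling has to move away from a fixed 'cpu' argument and into the models themselves. A minimal sketch of the decorator, with a placeholder handler body (nothing here is taken from the Space's code):

import spaces

@spaces.GPU(duration=60)  # optional: request the GPU for up to ~60 s per call
def handler(prompt: str) -> str:
    # GPU-touching work (model forward passes, tensors on 'cuda', ...) goes here.
    return prompt.upper()  # placeholder body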
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🔗
 colorFrom: gray
 colorTo: gray
 sdk: gradio
-python_version: 3.11
+python_version: 3.10
 sdk_version: 4.41.0
 app_file: app.py
 short_description: Tightly pair LLM responses
algorithms.py CHANGED
@@ -467,12 +467,12 @@ def _generate_streaming_impl(device, model, tokenizer, prompt_BL, max_tokens):
 
 # APOC unconditional streaming
 @torch.no_grad()
-def apoc_streaming(device, model_x, model_y, tokenizer, chat_x, chat_y, max_tokens=512, seed=None):
+def apoc_streaming(model_x, model_y, tokenizer, chat_x, chat_y, max_tokens=512, seed=None):
     if seed is not None:
         torch.manual_seed(seed)
 
-    prompt_x_BL = tokenize_prompt(device, tokenizer, chat_x, quiet=True)
-    prompt_y_BL = tokenize_prompt(device, tokenizer, chat_y, quiet=True)
+    prompt_x_BL = tokenize_prompt(model_x.device, tokenizer, chat_x, quiet=True)
+    prompt_y_BL = tokenize_prompt(model_y.device, tokenizer, chat_y, quiet=True)
     model_pair = ModelPair(model_x, model_y, prompt_x_BL, prompt_y_BL)
 
     logger.debug('PROMPT X:')
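
Dropping the explicit `device` argument means `apoc_streaming` now follows each model's own placement (`model_x.device`, `model_y.device`), which is whatever ZeroGPU/accelerate decided at load time. The repo's `tokenize_prompt` is not shown in this hunk; a hypothetical sketch of what such a helper typically does, to illustrate why passing the model's device is enough:

# Hypothetical helper, not the repo's actual tokenize_prompt.
def tokenize_prompt(device, tokenizer, chat, quiet=False):
    # Render the chat with the tokenizer's chat template, then tokenize.
    text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    ids = tokenizer(text, return_tensors='pt').input_ids  # shape (B, L)
    if not quiet:
        print(text)
    # Moving the ids onto `device` is why the caller can pass
    # model_x.device / model_y.device instead of a separate device argument.
    return ids.to(device)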
app.py CHANGED
@@ -1,5 +1,6 @@
 # Gradio demo of streaming generation of multiple LLM response pairs.
 
+import spaces
 import logging
 import time
 import html
@@ -7,6 +8,20 @@ import numpy as np
 import gradio as gr
 import util
 
+import huggingface_hub
+import torch
+import transformers
+import accelerate
+
+# For setting `requirements.txt`.
+print('Dependency versions:')
+print(f'huggingface_hub=={huggingface_hub.__version__}')
+print(f'numpy=={np.__version__}')
+print(f'torch=={torch.__version__}')
+print(f'transformers=={transformers.__version__}')
+print(f'accelerate=={accelerate.__version__}')
+print()
+
 # gr.DataFrame is currently bugged for updating values,
 # so we must use raw HTML.
 # https://github.com/gradio-app/gradio/issues/8160
@@ -95,7 +110,11 @@ else:
         ]
         return chat
 
+    @spaces.GPU
     def fn(max_tokens, num_responses, prompt_x, prompt_y):
+        # Is this necessary with ZeroGPU?
+        torch.use_deterministic_algorithms(True)
+
         rows = [['']*2 for i in range(num_responses)]
         yield make_html_table(HEADERS, rows)
 
@@ -106,7 +125,6 @@ else:
         chat_y = make_chat(system_msg, prompt_y)
 
         gen = algorithms.apoc_streaming(
-            'cpu',
             model,
             model,
             tokenizer,
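
`fn` is a generator (it yields HTML snapshots), and generator functions can be decorated with `spaces.GPU` as well, so the streaming behaviour is preserved under ZeroGPU. A self-contained sketch of the same pattern, with an illustrative tiny model and a hypothetical `stream_response` name (the Space's real model and helpers live in load.py and app.py):

import os
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Mirrors load.py: some CUDA ops require this once deterministic algorithms are on.
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

tokenizer = AutoTokenizer.from_pretrained('sshleifer/tiny-gpt2')  # illustrative model
model = AutoModelForCausalLM.from_pretrained('sshleifer/tiny-gpt2', device_map='auto')

@spaces.GPU
def stream_response(prompt, max_tokens=32):
    # Same conservative choice as app.py's "Is this necessary with ZeroGPU?":
    # set the determinism flag inside the decorated (GPU-side) call.
    torch.use_deterministic_algorithms(True)
    ids = tokenizer(prompt, return_tensors='pt').input_ids.to(model.device)
    text = ''
    with torch.no_grad():
        for _ in range(max_tokens):
            logits = model(ids).logits[:, -1, :]
            next_id = torch.argmax(logits, dim=-1, keepdim=True)  # greedy decoding
            ids = torch.cat([ids, next_id], dim=-1)
            text += tokenizer.decode(next_id[0])
            yield text  # Gradio renders each partial string as it streams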
load.py CHANGED
@@ -6,9 +6,11 @@ import torch
 import transformers
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
-def load_model(repo_id, device_map=None, bnb=None, torch_dtype='auto'):
+def load_model(repo_id, bnb=None, torch_dtype='auto'):
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
     # Try our best to get deterministic results.
-    if device_map is not None:
+    if device.type == 'cuda':
         # For determinism with CUDA >= 10.2, PyTorch says to use one of these.
         os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
         #os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16:8'
@@ -35,6 +37,11 @@ def load_model(repo_id, device_map=None, bnb=None, torch_dtype='auto'):
     if bnb == 'nf4':
         bnb_config = BitsAndBytesConfig(load_in_4bit=True)
 
+    device_map = 'auto'
+    if device.type == 'cpu':
+        # BFloat16 is not supported on MPS
+        device_map = None
+
     model = AutoModelForCausalLM.from_pretrained(
         repo_id,
         torch_dtype=torch_dtype,
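
The new `device_map='auto'` branch is also what pulls in `accelerate` (pinned in requirements.txt below): `from_pretrained` delegates weight placement to accelerate when a device map is given, and the resulting `model.device` is what algorithms.py now queries. A small hedged sketch with an illustrative repo id:

import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    'sshleifer/tiny-gpt2',   # illustrative repo id, not the Space's model
    torch_dtype='auto',
    device_map='auto',       # requires the accelerate package to be installed
)
print(model.device)          # cuda:0 on a GPU host, cpu otherwise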
requirements.txt CHANGED
@@ -1,4 +1,5 @@
 huggingface_hub==0.22.2
 numpy==1.26.4
 torch==2.2.2
-transformers==4.40.2
+transformers==4.40.2
+accelerate==0.31.0