Use ZeroGPU
Files changed:
- README.md: +1 -1
- algorithms.py: +3 -3
- app.py: +19 -1
- load.py: +9 -2
- requirements.txt: +2 -1
README.md CHANGED

@@ -4,7 +4,7 @@ emoji: 🔗
 colorFrom: gray
 colorTo: gray
 sdk: gradio
-python_version: 3.
+python_version: 3.10
 sdk_version: 4.41.0
 app_file: app.py
 short_description: Tightly pair LLM responses
algorithms.py CHANGED

@@ -467,12 +467,12 @@ def _generate_streaming_impl(device, model, tokenizer, prompt_BL, max_tokens):
 
 # APOC unconditional streaming
 @torch.no_grad()
-def apoc_streaming(device, model_x, model_y, tokenizer, chat_x, chat_y, max_tokens=512, seed=None):
+def apoc_streaming(model_x, model_y, tokenizer, chat_x, chat_y, max_tokens=512, seed=None):
     if seed is not None:
         torch.manual_seed(seed)
 
-    prompt_x_BL = tokenize_prompt(device, tokenizer, chat_x, quiet=True)
-    prompt_y_BL = tokenize_prompt(device, tokenizer, chat_y, quiet=True)
+    prompt_x_BL = tokenize_prompt(model_x.device, tokenizer, chat_x, quiet=True)
+    prompt_y_BL = tokenize_prompt(model_y.device, tokenizer, chat_y, quiet=True)
     model_pair = ModelPair(model_x, model_y, prompt_x_BL, prompt_y_BL)
 
     logger.debug('PROMPT X:')
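The explicit device argument goes away because each model now carries its own placement. A minimal sketch of that pattern, using a hypothetical placeholder checkpoint rather than the Space's actual model:

    # Sketch only: inputs follow the model via `model.device`, so callers
    # no longer need to thread a `device` argument through.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    repo_id = 'gpt2'  # hypothetical stand-in for the Space's checkpoint
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    model = AutoModelForCausalLM.from_pretrained(repo_id, device_map='auto')

    prompt_BL = tokenizer('Hello', return_tensors='pt').input_ids.to(model.device)
    with torch.no_grad():
        out = model.generate(prompt_BL, max_new_tokens=8)
    print(tokenizer.decode(out[0]))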
app.py CHANGED

@@ -1,5 +1,6 @@
 # Gradio demo of streaming generation of multiple LLM response pairs.
 
+import spaces
 import logging
 import time
 import html
@@ -7,6 +8,20 @@ import numpy as np
 import gradio as gr
 import util
 
+import huggingface_hub
+import torch
+import transformers
+import accelerate
+
+# For setting `requirements.txt`.
+print('Dependency versions:')
+print(f'huggingface_hub=={huggingface_hub.__version__}')
+print(f'numpy=={np.__version__}')
+print(f'torch=={torch.__version__}')
+print(f'transformers=={transformers.__version__}')
+print(f'accelerate=={accelerate.__version__}')
+print()
+
 # gr.DataFrame is currently bugged for updating values,
 # so we must use raw HTML.
 # https://github.com/gradio-app/gradio/issues/8160
@@ -95,7 +110,11 @@ else:
         ]
         return chat
 
+    @spaces.GPU
     def fn(max_tokens, num_responses, prompt_x, prompt_y):
+        # Is this necessary with ZeroGPU?
+        torch.use_deterministic_algorithms(True)
+
         rows = [['']*2 for i in range(num_responses)]
         yield make_html_table(HEADERS, rows)
 
@@ -106,7 +125,6 @@ else:
         chat_y = make_chat(system_msg, prompt_y)
 
         gen = algorithms.apoc_streaming(
-            'cpu',
             model,
             model,
             tokenizer,
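For reference, the usual ZeroGPU pattern that `@spaces.GPU` enables looks roughly like the toy Space below; the function and variable names here are illustrative, not the Space's own:

    # Toy ZeroGPU app, for illustration only. `@spaces.GPU` marks the function
    # that needs CUDA; a GPU is attached just for the duration of each call.
    import spaces
    import torch
    import gradio as gr

    @spaces.GPU  # an optional duration can be given, e.g. @spaces.GPU(duration=120)
    def run(prompt):
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        return f'Running on {device}: {prompt}'

    demo = gr.Interface(fn=run, inputs='text', outputs='text')

    if __name__ == '__main__':
        demo.launch()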
load.py CHANGED

@@ -6,9 +6,11 @@ import torch
 import transformers
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 
-def load_model(repo_id, device_map=None, bnb=None, torch_dtype='auto'):
+def load_model(repo_id, bnb=None, torch_dtype='auto'):
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
     # Try our best to get deterministic results.
-    if
+    if device.type == 'cuda':
        # For determinism with CUDA >= 10.2, PyTorch says to use one of these.
        os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
        #os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16:8'
@@ -35,6 +37,11 @@ def load_model(repo_id, device_map=None, bnb=None, torch_dtype='auto'):
     if bnb == 'nf4':
         bnb_config = BitsAndBytesConfig(load_in_4bit=True)
 
+    device_map = 'auto'
+    if device.type == 'cpu':
+        # BFloat16 is not supported on MPS
+        device_map = None
+
     model = AutoModelForCausalLM.from_pretrained(
         repo_id,
         torch_dtype=torch_dtype,
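The new load path boils down to roughly the following sketch (the 'gpt2' placeholder and the helper name are assumptions, not the Space's actual values): detect the device once, then pick `device_map` accordingly so a CPU-only run skips accelerate's auto placement.

    # Sketch of the device-selection idea, not the Space's exact load.py.
    import torch
    from transformers import AutoModelForCausalLM

    def load(repo_id='gpt2'):
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

        # `device_map='auto'` relies on accelerate; on a CPU-only machine,
        # fall back to a plain load and keep the model where it lands.
        device_map = 'auto' if device.type == 'cuda' else None

        model = AutoModelForCausalLM.from_pretrained(
            repo_id,
            torch_dtype='auto',
            device_map=device_map,
        )
        return model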
requirements.txt CHANGED

@@ -1,4 +1,5 @@
 huggingface_hub==0.22.2
 numpy==1.26.4
 torch==2.2.2
-transformers==4.40.2
+transformers==4.40.2
+accelerate==0.31.0