wondervictor committed on
Commit
876dc56
1 Parent(s): fc81a43
.gitignore CHANGED
@@ -154,6 +154,11 @@ dmypy.json
 # Cython debug symbols
 cython_debug/
 
+*.safetensors
+*.lock
+*.bin
+*.pt
+*.json
 # PyCharm
 # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
 # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
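
Note: the added patterns keep large model artifacts (weights, lock files, metadata) out of version control. A minimal sketch of how these globs match file names, using Python's fnmatch as a rough stand-in for .gitignore semantics on simple extension patterns:

# Sketch: fnmatch approximates .gitignore globbing for plain "*.ext" patterns.
import fnmatch

ignored_patterns = ["*.safetensors", "*.lock", "*.bin", "*.pt", "*.json"]

def is_ignored(filename: str) -> bool:
    # True if the file matches any of the newly ignored patterns.
    return any(fnmatch.fnmatch(filename, p) for p in ignored_patterns)

print(is_ignored("vq_ds16_t2i.pt"))  # True
print(is_ignored("app.py"))          # False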
app.py CHANGED
@@ -18,6 +18,7 @@ DESCRIPTION = "# [ControlAR: Controllable Image Generation with Autoregressive M
 SHOW_DUPLICATE_BUTTON = os.getenv("SHOW_DUPLICATE_BUTTON") == "1"
 model = Model()
 device = "cuda"
+model.to(device)
 with gr.Blocks(css="style.css") as demo:
     gr.Markdown(DESCRIPTION)
     gr.DuplicateButton(
@@ -26,8 +27,8 @@ with gr.Blocks(css="style.css") as demo:
         visible=SHOW_DUPLICATE_BUTTON,
     )
     with gr.Tabs():
-        with gr.TabItem("Depth"):
-            create_demo_depth(model.process_depth)
+        # with gr.TabItem("Depth"):
+        #     create_demo_depth(model.process_depth)
         with gr.TabItem("Canny"):
             create_demo_canny(model.process_canny)
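
Note: model.to(device) now runs once at import time, so weights reach the GPU before Gradio starts serving instead of on the first request. A minimal, self-contained sketch of that startup ordering (TinyModel is illustrative, and the cpu fallback exists only to keep the sketch runnable anywhere):

# Sketch: eager device placement at startup, before any request arrives.
import torch
import torch.nn as nn

class TinyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(8, 8)

model = TinyModel()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)                        # one-time transfer at startup
print(next(model.parameters()).device)  # confirms where the weights live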
app_canny.py CHANGED
@@ -104,18 +104,18 @@ def create_demo(process):
             canny_low_threshold,
             canny_high_threshold,
         ]
-        prompt.submit(
-            fn=randomize_seed_fn,
-            inputs=[seed, randomize_seed],
-            outputs=seed,
-            queue=False,
-            api_name=False,
-        ).then(
-            fn=process,
-            inputs=inputs,
-            outputs=result,
-            api_name=False,
-        )
+        # prompt.submit(
+        #     fn=randomize_seed_fn,
+        #     inputs=[seed, randomize_seed],
+        #     outputs=seed,
+        #     queue=False,
+        #     api_name=False,
+        # ).then(
+        #     fn=process,
+        #     inputs=inputs,
+        #     outputs=result,
+        #     api_name=False,
+        # )
         run_button.click(
             fn=randomize_seed_fn,
             inputs=[seed, randomize_seed],
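
Note: both the commented-out prompt.submit chain and the surviving run_button.click use Gradio's two-step event pattern: re-roll the seed first, then run generation via .then(). A minimal self-contained sketch of that pattern, with illustrative component names and a dummy process function:

# Sketch: chaining two events so the seed is finalized before generation runs.
import random
import gradio as gr

def randomize_seed_fn(seed: int, randomize: bool) -> int:
    return random.randint(0, 2**31 - 1) if randomize else int(seed)

def process(seed: int) -> str:
    return f"generated with seed {int(seed)}"

with gr.Blocks() as demo:
    seed = gr.Slider(0, 2**31 - 1, step=1, label="Seed")
    randomize_seed = gr.Checkbox(value=True, label="Randomize seed")
    result = gr.Textbox(label="Result")
    run_button = gr.Button("Run")
    run_button.click(
        fn=randomize_seed_fn,            # step 1: maybe re-roll the seed
        inputs=[seed, randomize_seed],
        outputs=seed,
        queue=False,
        api_name=False,
    ).then(
        fn=process,                      # step 2: generate with the final seed
        inputs=seed,
        outputs=result,
        api_name=False,
    )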
autoregressive/models/gpt_t2i.py CHANGED
@@ -375,8 +375,6 @@ class Transformer(nn.Module):
         # Zero-out output layers:
         nn.init.constant_(self.output.weight, 0)
 
-
-
     def _init_weights(self, module):
         std = self.config.initializer_range
         if isinstance(module, nn.Linear):
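
Note: the surrounding code zeroes the output head with nn.init.constant_ while _init_weights draws Linear weights from a normal distribution with std = config.initializer_range. A minimal sketch of that recipe; the body after the isinstance check is an assumption about the usual GPT-style init, not the repo's exact code:

# Sketch: zero-init the output head, normal-init other Linear layers.
import torch.nn as nn

def init_weights(module: nn.Module, std: float = 0.02) -> None:
    if isinstance(module, nn.Linear):
        module.weight.data.normal_(mean=0.0, std=std)  # assumed normal init
        if module.bias is not None:
            module.bias.data.zero_()

output_head = nn.Linear(512, 16384)
nn.init.constant_(output_head.weight, 0)  # zero-out output layer, as above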
checkpoints/flan-t5-xl/flan-t5-xl/spiece.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
+size 791656
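
Note: the added spiece.model is a Git LFS pointer, not the SentencePiece binary itself; LFS resolves the sha256 oid to the real 791,656-byte file at checkout. A minimal sketch of reading such a pointer:

# Sketch: each pointer line is "key value" with keys version, oid, size.
def parse_lfs_pointer(text: str) -> dict:
    return dict(line.split(" ", 1) for line in text.strip().splitlines())

with open("checkpoints/flan-t5-xl/flan-t5-xl/spiece.model") as f:
    pointer = parse_lfs_pointer(f.read())
print(pointer["oid"], pointer["size"])  # sha256:d60acb... 791656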
language/t5.py CHANGED
@@ -18,7 +18,7 @@ class T5Embedder:
 
     def __init__(self, device, dir_or_name='t5-v1_1-xxl', *, local_cache=False, cache_dir=None, hf_token=None, use_text_preprocessing=True,
                  t5_model_kwargs=None, torch_dtype=None, use_offload_folder=None, model_max_length=120):
-        self.device = torch.device(device)
+        self.device = torch.device('cuda:0')
         self.torch_dtype = torch_dtype or torch.bfloat16
         if t5_model_kwargs is None:
             t5_model_kwargs = {'low_cpu_mem_usage': True, 'torch_dtype': self.torch_dtype}
@@ -53,6 +53,7 @@
         print(tokenizer_path)
         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
         self.model = T5EncoderModel.from_pretrained(path, **t5_model_kwargs).eval()
+        self.model.to('cuda')
         self.model_max_length = model_max_length
 
     def get_text_embeddings(self, texts):
@@ -72,11 +73,12 @@
         text_tokens_and_mask['attention_mask'] = text_tokens_and_mask['attention_mask']
 
         with torch.no_grad():
+            print("t5:", self.model.device)
             text_encoder_embs = self.model(
-                input_ids=text_tokens_and_mask['input_ids'].to(self.device),
-                attention_mask=text_tokens_and_mask['attention_mask'].to(self.device),
+                input_ids=text_tokens_and_mask['input_ids'].to(self.model.device),
+                attention_mask=text_tokens_and_mask['attention_mask'].to(self.model.device),
             )['last_hidden_state'].detach()
-        return text_encoder_embs, text_tokens_and_mask['attention_mask'].to(self.device)
+        return text_encoder_embs, text_tokens_and_mask['attention_mask'].to(self.model.device)
 
     def text_preprocessing(self, text):
         if self.use_text_preprocessing:
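
Note: after this change, input tensors follow the model's actual placement (self.model.device) rather than the separately tracked self.device, so the two can no longer drift apart. A minimal self-contained sketch of the same encode path; the google/flan-t5-xl hub id is an assumption standing in for the local checkpoints/flan-t5-xl directory:

# Sketch: tokenize, move tensors to the model's device, and encode.
import torch
from transformers import AutoTokenizer, T5EncoderModel

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl")
model = T5EncoderModel.from_pretrained("google/flan-t5-xl").eval().to("cuda")

tokens = tokenizer(["a photo of a cat"], padding="max_length", max_length=120,
                   truncation=True, return_tensors="pt")
with torch.no_grad():
    embs = model(
        input_ids=tokens["input_ids"].to(model.device),
        attention_mask=tokens["attention_mask"].to(model.device),
    )["last_hidden_state"]
print(embs.shape, embs.device)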
model.py CHANGED
@@ -40,7 +40,7 @@ class Model:
 
     def __init__(self):
         self.device = torch.device(
-            "cuda:0" if torch.cuda.is_available() else "cpu")
+            "cuda:0")
         self.base_model_id = ""
         self.task_name = ""
         self.vq_model = self.load_vq()
@@ -48,12 +48,17 @@
         self.gpt_model_canny = self.load_gpt(condition_type='canny')
         self.gpt_model_depth = self.load_gpt(condition_type='depth')
         self.get_control_canny = CannyDetector()
-        self.get_control_depth = MidasDetector(device=self.device)
+        self.get_control_depth = MidasDetector('cuda')
+
+    def to(self, device):
+        self.gpt_model_canny.to('cuda')
+        print(next(self.gpt_model_canny.adapter.parameters()).device)
+        # print(self.gpt_model_canny.device)
 
     def load_vq(self):
         vq_model = VQ_models["VQ-16"](codebook_size=16384,
                                       codebook_embed_dim=8)
-        vq_model.to(self.device)
+        vq_model.to('cuda')
         vq_model.eval()
         checkpoint = torch.load(f"checkpoints/vq_ds16_t2i.pt",
                                 map_location="cpu")
@@ -71,7 +76,7 @@
             cls_token_num=120,
             model_type='t2i',
             condition_type=condition_type,
-        ).to(device=self.device, dtype=precision)
+        ).to(device='cuda', dtype=precision)
 
         model_weight = load_file(gpt_ckpt)
         gpt_model.load_state_dict(model_weight, strict=False)
@@ -82,7 +87,7 @@
     def load_t5(self):
         precision = torch.bfloat16
         t5_model = T5Embedder(
-            device=self.device,
+            device="cuda",
             local_cache=False,
            cache_dir='checkpoints/flan-t5-xl',
            dir_or_name='flan-t5-xl',
@@ -134,6 +139,7 @@
         c_emb_masks = new_emb_masks
         qzshape = [len(c_indices), 8, H // 16, W // 16]
         t1 = time.time()
+        print(caption_embs.device)
         index_sample = generate(
             self.gpt_model_canny,
             c_indices,
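
Note: the new Model.to ignores its device argument and moves only gpt_model_canny. A device-respecting variant would move every submodule; the sketch below assumes the attributes created in __init__, and that the T5Embedder returned by load_t5 is stored as t5_model (that assignment is outside the hunk):

# Sketch: move all submodules of a Model-like object to one device.
import torch

def move_model(m, device):
    m.device = torch.device(device)
    m.vq_model.to(device)
    m.gpt_model_canny.to(device)
    m.gpt_model_depth.to(device)
    m.t5_model.model.to(device)  # assumption: T5Embedder exposes .model
    return m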