dylanebert committed
Commit 161f2ca
Parent: 016512c

update with instantmesh pipeline

model_index.json CHANGED
@@ -106,8 +106,8 @@
106
  "CLIPTokenizer"
107
  ],
108
  "unet": [
109
- "diffusers",
+ null,
110
- "UNet2DConditionModel"
+ null
111
  ],
112
  "vae": [
113
  "diffusers",
pipeline.py ADDED
@@ -0,0 +1,547 @@
1
+ from typing import Any, Dict, Optional
2
+ from diffusers.schedulers import KarrasDiffusionSchedulers
3
+
4
+ import numpy
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.utils.checkpoint
8
+ import torch.distributed
9
+ import transformers
10
+ from collections import OrderedDict
11
+ from PIL import Image
12
+ from torchvision import transforms
13
+ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
14
+ from diffusers.utils import BaseOutput
15
+
16
+ import rembg
17
+ from torchvision.transforms import v2
18
+
19
+ import diffusers
20
+ from diffusers import (
21
+ AutoencoderKL,
22
+ DDPMScheduler,
23
+ DiffusionPipeline,
24
+ EulerAncestralDiscreteScheduler,
25
+ UNet2DConditionModel,
26
+ )
27
+ from diffusers.image_processor import VaeImageProcessor
28
+ from diffusers.models.attention_processor import (
29
+ Attention,
30
+ AttnProcessor,
31
+ XFormersAttnProcessor,
32
+ AttnProcessor2_0,
33
+ )
34
+ from diffusers.utils.import_utils import is_xformers_available
35
+
36
+
37
+ def to_rgb_image(maybe_rgba: Image.Image):
38
+ if maybe_rgba.mode == "RGB":
39
+ return maybe_rgba
40
+ elif maybe_rgba.mode == "RGBA":
41
+ rgba = maybe_rgba
42
+ img = numpy.random.randint(
43
+ 255, 256, size=[rgba.size[1], rgba.size[0], 3], dtype=numpy.uint8
44
+ )
45
+ img = Image.fromarray(img, "RGB")
46
+ img.paste(rgba, mask=rgba.getchannel("A"))
47
+ return img
48
+ else:
49
+ raise ValueError("Unsupported image type.", maybe_rgba.mode)
50
+
51
+
52
+ class ReferenceOnlyAttnProc(torch.nn.Module):
53
+ def __init__(self, chained_proc, enabled=False, name=None) -> None:
54
+ super().__init__()
55
+ self.enabled = enabled
56
+ self.chained_proc = chained_proc
57
+ self.name = name
58
+
59
+ def __call__(
60
+ self,
61
+ attn: Attention,
62
+ hidden_states,
63
+ encoder_hidden_states=None,
64
+ attention_mask=None,
65
+ mode="w",
66
+ ref_dict: dict = None,
67
+ is_cfg_guidance=False,
68
+ ) -> Any:
69
+ if encoder_hidden_states is None:
70
+ encoder_hidden_states = hidden_states
71
+ if self.enabled and is_cfg_guidance:
72
+ res0 = self.chained_proc(
73
+ attn, hidden_states[:1], encoder_hidden_states[:1], attention_mask
74
+ )
75
+ hidden_states = hidden_states[1:]
76
+ encoder_hidden_states = encoder_hidden_states[1:]
77
+ if self.enabled:
78
+ if mode == "w":
79
+ ref_dict[self.name] = encoder_hidden_states
80
+ elif mode == "r":
81
+ encoder_hidden_states = torch.cat(
82
+ [encoder_hidden_states, ref_dict.pop(self.name)], dim=1
83
+ )
84
+ elif mode == "m":
85
+ encoder_hidden_states = torch.cat(
86
+ [encoder_hidden_states, ref_dict[self.name]], dim=1
87
+ )
88
+ else:
89
+ assert False, mode
90
+ res = self.chained_proc(
91
+ attn, hidden_states, encoder_hidden_states, attention_mask
92
+ )
93
+ if self.enabled and is_cfg_guidance:
94
+ res = torch.cat([res0, res])
95
+ return res
96
+
97
+
98
+ class RefOnlyNoisedUNet(torch.nn.Module):
99
+ def __init__(
100
+ self,
101
+ unet: UNet2DConditionModel,
102
+ train_sched: DDPMScheduler,
103
+ val_sched: EulerAncestralDiscreteScheduler,
104
+ ) -> None:
105
+ super().__init__()
106
+ self.unet = unet
107
+ self.train_sched = train_sched
108
+ self.val_sched = val_sched
109
+
110
+ unet_lora_attn_procs = dict()
111
+ for name, _ in unet.attn_processors.items():
112
+ if torch.__version__ >= "2.0":
113
+ default_attn_proc = AttnProcessor2_0()
114
+ elif is_xformers_available():
115
+ default_attn_proc = XFormersAttnProcessor()
116
+ else:
117
+ default_attn_proc = AttnProcessor()
118
+ unet_lora_attn_procs[name] = ReferenceOnlyAttnProc(
119
+ default_attn_proc, enabled=name.endswith("attn1.processor"), name=name
120
+ )
121
+ unet.set_attn_processor(unet_lora_attn_procs)
122
+
123
+ def __getattr__(self, name: str):
124
+ try:
125
+ return super().__getattr__(name)
126
+ except AttributeError:
127
+ return getattr(self.unet, name)
128
+
129
+ def forward_cond(
130
+ self,
131
+ noisy_cond_lat,
132
+ timestep,
133
+ encoder_hidden_states,
134
+ class_labels,
135
+ ref_dict,
136
+ is_cfg_guidance,
137
+ **kwargs,
138
+ ):
139
+ if is_cfg_guidance:
140
+ encoder_hidden_states = encoder_hidden_states[1:]
141
+ class_labels = class_labels[1:]
142
+ self.unet(
143
+ noisy_cond_lat,
144
+ timestep,
145
+ encoder_hidden_states=encoder_hidden_states,
146
+ class_labels=class_labels,
147
+ cross_attention_kwargs=dict(mode="w", ref_dict=ref_dict),
148
+ **kwargs,
149
+ )
150
+
151
+ def forward(
152
+ self,
153
+ sample,
154
+ timestep,
155
+ encoder_hidden_states,
156
+ class_labels=None,
157
+ *args,
158
+ cross_attention_kwargs,
159
+ down_block_res_samples=None,
160
+ mid_block_res_sample=None,
161
+ **kwargs,
162
+ ):
163
+ cond_lat = cross_attention_kwargs["cond_lat"]
164
+ is_cfg_guidance = cross_attention_kwargs.get("is_cfg_guidance", False)
165
+ noise = torch.randn_like(cond_lat)
166
+ if self.training:
167
+ noisy_cond_lat = self.train_sched.add_noise(cond_lat, noise, timestep)
168
+ noisy_cond_lat = self.train_sched.scale_model_input(
169
+ noisy_cond_lat, timestep
170
+ )
171
+ else:
172
+ noisy_cond_lat = self.val_sched.add_noise(
173
+ cond_lat, noise, timestep.reshape(-1)
174
+ )
175
+ noisy_cond_lat = self.val_sched.scale_model_input(
176
+ noisy_cond_lat, timestep.reshape(-1)
177
+ )
178
+ ref_dict = {}
179
+ self.forward_cond(
180
+ noisy_cond_lat,
181
+ timestep,
182
+ encoder_hidden_states,
183
+ class_labels,
184
+ ref_dict,
185
+ is_cfg_guidance,
186
+ **kwargs,
187
+ )
188
+ weight_dtype = self.unet.dtype
189
+ return self.unet(
190
+ sample,
191
+ timestep,
192
+ encoder_hidden_states,
193
+ *args,
194
+ class_labels=class_labels,
195
+ cross_attention_kwargs=dict(
196
+ mode="r", ref_dict=ref_dict, is_cfg_guidance=is_cfg_guidance
197
+ ),
198
+ down_block_additional_residuals=(
199
+ [sample.to(dtype=weight_dtype) for sample in down_block_res_samples]
200
+ if down_block_res_samples is not None
201
+ else None
202
+ ),
203
+ mid_block_additional_residual=(
204
+ mid_block_res_sample.to(dtype=weight_dtype)
205
+ if mid_block_res_sample is not None
206
+ else None
207
+ ),
208
+ **kwargs,
209
+ )
210
+
211
+
212
+ def scale_latents(latents):
213
+ latents = (latents - 0.22) * 0.75
214
+ return latents
215
+
216
+
217
+ def unscale_latents(latents):
218
+ latents = latents / 0.75 + 0.22
219
+ return latents
220
+
221
+
222
+ def scale_image(image):
223
+ image = image * 0.5 / 0.8
224
+ return image
225
+
226
+
227
+ def unscale_image(image):
228
+ image = image / 0.5 * 0.8
229
+ return image
230
+
231
+
232
+ class DepthControlUNet(torch.nn.Module):
233
+ def __init__(
234
+ self,
235
+ unet: RefOnlyNoisedUNet,
236
+ controlnet: Optional[diffusers.ControlNetModel] = None,
237
+ conditioning_scale=1.0,
238
+ ) -> None:
239
+ super().__init__()
240
+ self.unet = unet
241
+ if controlnet is None:
242
+ self.controlnet = diffusers.ControlNetModel.from_unet(unet.unet)
243
+ else:
244
+ self.controlnet = controlnet
245
+ DefaultAttnProc = AttnProcessor2_0
246
+ if is_xformers_available():
247
+ DefaultAttnProc = XFormersAttnProcessor
248
+ self.controlnet.set_attn_processor(DefaultAttnProc())
249
+ self.conditioning_scale = conditioning_scale
250
+
251
+ def __getattr__(self, name: str):
252
+ try:
253
+ return super().__getattr__(name)
254
+ except AttributeError:
255
+ return getattr(self.unet, name)
256
+
257
+ def forward(
258
+ self,
259
+ sample,
260
+ timestep,
261
+ encoder_hidden_states,
262
+ class_labels=None,
263
+ *args,
264
+ cross_attention_kwargs: dict,
265
+ **kwargs,
266
+ ):
267
+ cross_attention_kwargs = dict(cross_attention_kwargs)
268
+ control_depth = cross_attention_kwargs.pop("control_depth")
269
+ down_block_res_samples, mid_block_res_sample = self.controlnet(
270
+ sample,
271
+ timestep,
272
+ encoder_hidden_states=encoder_hidden_states,
273
+ controlnet_cond=control_depth,
274
+ conditioning_scale=self.conditioning_scale,
275
+ return_dict=False,
276
+ )
277
+ return self.unet(
278
+ sample,
279
+ timestep,
280
+ encoder_hidden_states=encoder_hidden_states,
281
+ down_block_res_samples=down_block_res_samples,
282
+ mid_block_res_sample=mid_block_res_sample,
283
+ cross_attention_kwargs=cross_attention_kwargs,
284
+ )
285
+
286
+
287
+ class ModuleListDict(torch.nn.Module):
288
+ def __init__(self, procs: dict) -> None:
289
+ super().__init__()
290
+ self.keys = sorted(procs.keys())
291
+ self.values = torch.nn.ModuleList(procs[k] for k in self.keys)
292
+
293
+ def __getitem__(self, key):
294
+ return self.values[self.keys.index(key)]
295
+
296
+
297
+ class SuperNet(torch.nn.Module):
298
+ def __init__(self, state_dict: Dict[str, torch.Tensor]):
299
+ super().__init__()
300
+ state_dict = OrderedDict((k, state_dict[k]) for k in sorted(state_dict.keys()))
301
+ self.layers = torch.nn.ModuleList(state_dict.values())
302
+ self.mapping = dict(enumerate(state_dict.keys()))
303
+ self.rev_mapping = {v: k for k, v in enumerate(state_dict.keys())}
304
+
305
+ # .processor for unet, .self_attn for text encoder
306
+ self.split_keys = [".processor", ".self_attn"]
307
+
308
+ # we add a hook to state_dict() and load_state_dict() so that the
309
+ # naming fits with `unet.attn_processors`
310
+ def map_to(module, state_dict, *args, **kwargs):
311
+ new_state_dict = {}
312
+ for key, value in state_dict.items():
313
+ num = int(key.split(".")[1]) # 0 is always "layers"
314
+ new_key = key.replace(f"layers.{num}", module.mapping[num])
315
+ new_state_dict[new_key] = value
316
+
317
+ return new_state_dict
318
+
319
+ def remap_key(key, state_dict):
320
+ for k in self.split_keys:
321
+ if k in key:
322
+ return key.split(k)[0] + k
323
+ return key.split(".")[0]
324
+
325
+ def map_from(module, state_dict, *args, **kwargs):
326
+ all_keys = list(state_dict.keys())
327
+ for key in all_keys:
328
+ replace_key = remap_key(key, state_dict)
329
+ new_key = key.replace(
330
+ replace_key, f"layers.{module.rev_mapping[replace_key]}"
331
+ )
332
+ state_dict[new_key] = state_dict[key]
333
+ del state_dict[key]
334
+
335
+ self._register_state_dict_hook(map_to)
336
+ self._register_load_state_dict_pre_hook(map_from, with_module=True)
337
+
338
+
339
+ class Zero123PlusPipelineOutput(BaseOutput):
340
+ images: torch.Tensor
341
+
342
+
343
+ class Zero123PlusPipeline(diffusers.StableDiffusionPipeline):
344
+ tokenizer: transformers.CLIPTokenizer
345
+ text_encoder: transformers.CLIPTextModel
346
+ vision_encoder: transformers.CLIPVisionModelWithProjection
347
+
348
+ feature_extractor_clip: transformers.CLIPImageProcessor
349
+ unet: UNet2DConditionModel
350
+ scheduler: diffusers.schedulers.KarrasDiffusionSchedulers
351
+
352
+ vae: AutoencoderKL
353
+ ramping: nn.Linear
354
+
355
+ feature_extractor_vae: transformers.CLIPImageProcessor
356
+
357
+ depth_transforms_multi = transforms.Compose(
358
+ [transforms.ToTensor(), transforms.Normalize([0.5], [0.5])]
359
+ )
360
+
361
+ def __init__(
362
+ self,
363
+ vae: AutoencoderKL,
364
+ text_encoder: CLIPTextModel,
365
+ tokenizer: CLIPTokenizer,
366
+ unet: UNet2DConditionModel,
367
+ scheduler: KarrasDiffusionSchedulers,
368
+ vision_encoder: transformers.CLIPVisionModelWithProjection,
369
+ feature_extractor_clip: CLIPImageProcessor,
370
+ feature_extractor_vae: CLIPImageProcessor,
371
+ ramping_coefficients: Optional[list] = None,
372
+ safety_checker=None,
373
+ ):
374
+ DiffusionPipeline.__init__(self)
375
+
376
+ self.register_modules(
377
+ vae=vae,
378
+ text_encoder=text_encoder,
379
+ tokenizer=tokenizer,
380
+ unet=unet,
381
+ scheduler=scheduler,
382
+ safety_checker=None,
383
+ vision_encoder=vision_encoder,
384
+ feature_extractor_clip=feature_extractor_clip,
385
+ feature_extractor_vae=feature_extractor_vae,
386
+ )
387
+ self.register_to_config(ramping_coefficients=ramping_coefficients)
388
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
389
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
390
+
391
+ def prepare(self):
392
+ train_sched = DDPMScheduler.from_config(self.scheduler.config)
393
+ if isinstance(self.unet, UNet2DConditionModel):
394
+ self.unet = RefOnlyNoisedUNet(self.unet, train_sched, self.scheduler).eval()
395
+
396
+ def add_controlnet(
397
+ self,
398
+ controlnet: Optional[diffusers.ControlNetModel] = None,
399
+ conditioning_scale=1.0,
400
+ ):
401
+ self.prepare()
402
+ self.unet = DepthControlUNet(self.unet, controlnet, conditioning_scale)
403
+ return SuperNet(OrderedDict([("controlnet", self.unet.controlnet)]))
404
+
405
+ def encode_condition_image(self, image: torch.Tensor):
406
+ image = self.vae.encode(image).latent_dist.sample()
407
+ return image
408
+
409
+ @torch.no_grad()
410
+ def __call__(
411
+ self,
412
+ image: Image.Image = None,
413
+ prompt="",
414
+ *args,
415
+ num_images_per_prompt: Optional[int] = 1,
416
+ guidance_scale=4.0,
417
+ depth_image: Image.Image = None,
418
+ output_type: Optional[str] = "pil",
419
+ width=640,
420
+ height=960,
421
+ num_inference_steps=28,
422
+ return_dict=True,
423
+ **kwargs,
424
+ ):
425
+ self.prepare()
426
+ if image is None:
427
+ raise ValueError(
428
+ "Inputting embeddings not supported for this pipeline. Please pass an image."
429
+ )
430
+ assert not isinstance(image, torch.Tensor)
431
+
432
+ image = rembg.remove(image)
433
+
434
+ image = numpy.array(image)
435
+ alpha = numpy.where(image[..., 3] > 0)
436
+ y1, y2, x1, x2 = (
437
+ alpha[0].min(),
438
+ alpha[0].max(),
439
+ alpha[1].min(),
440
+ alpha[1].max(),
441
+ )
442
+ fg = image[y1:y2, x1:x2]
443
+ size = max(fg.shape[0], fg.shape[1])
444
+ ph0, pw0 = (size - fg.shape[0]) // 2, (size - fg.shape[1]) // 2
445
+ ph1, pw1 = size - fg.shape[0] - ph0, size - fg.shape[1] - pw0
446
+ image = numpy.pad(
447
+ fg,
448
+ ((ph0, ph1), (pw0, pw1), (0, 0)),
449
+ mode="constant",
450
+ constant_values=((0, 0), (0, 0), (0, 0)),
451
+ )
452
+
453
+ new_size = int(image.shape[0] / 0.85)
454
+ ph0, pw0 = (new_size - size) // 2, (new_size - size) // 2
455
+ ph1, pw1 = new_size - size - ph0, new_size - size - pw0
456
+ image = numpy.pad(
457
+ image,
458
+ ((ph0, ph1), (pw0, pw1), (0, 0)),
459
+ mode="constant",
460
+ constant_values=((0, 0), (0, 0), (0, 0)),
461
+ )
462
+ image = Image.fromarray(image)
463
+
464
+ # images = mv_pipeline(image).images[0]
465
+
466
+ image = to_rgb_image(image)
467
+ image_1 = self.feature_extractor_vae(
468
+ images=image, return_tensors="pt"
469
+ ).pixel_values
470
+ image_2 = self.feature_extractor_clip(
471
+ images=image, return_tensors="pt"
472
+ ).pixel_values
473
+ if depth_image is not None and hasattr(self.unet, "controlnet"):
474
+ depth_image = to_rgb_image(depth_image)
475
+ depth_image = self.depth_transforms_multi(depth_image).to(
476
+ device=self.unet.controlnet.device, dtype=self.unet.controlnet.dtype
477
+ )
478
+ image = image_1.to(device=self.vae.device, dtype=self.vae.dtype)
479
+ image_2 = image_2.to(device=self.vae.device, dtype=self.vae.dtype)
480
+ cond_lat = self.encode_condition_image(image)
481
+ if guidance_scale > 1:
482
+ negative_lat = self.encode_condition_image(torch.zeros_like(image))
483
+ cond_lat = torch.cat([negative_lat, cond_lat])
484
+ encoded = self.vision_encoder(image_2, output_hidden_states=False)
485
+ global_embeds = encoded.image_embeds
486
+ global_embeds = global_embeds.unsqueeze(-2)
487
+
488
+ if hasattr(self, "encode_prompt"):
489
+ encoder_hidden_states = self.encode_prompt(
490
+ prompt, self.device, num_images_per_prompt, False
491
+ )[0]
492
+ else:
493
+ encoder_hidden_states = self._encode_prompt(
494
+ prompt, self.device, num_images_per_prompt, False
495
+ )
496
+ ramp = global_embeds.new_tensor(self.config.ramping_coefficients).unsqueeze(-1)
497
+ encoder_hidden_states = encoder_hidden_states + global_embeds * ramp
498
+ cak = dict(cond_lat=cond_lat)
499
+ if hasattr(self.unet, "controlnet"):
500
+ cak["control_depth"] = depth_image
501
+ latents: torch.Tensor = (
502
+ super()
503
+ .__call__(
504
+ None,
505
+ *args,
506
+ cross_attention_kwargs=cak,
507
+ guidance_scale=guidance_scale,
508
+ num_images_per_prompt=num_images_per_prompt,
509
+ prompt_embeds=encoder_hidden_states,
510
+ num_inference_steps=num_inference_steps,
511
+ output_type="latent",
512
+ width=width,
513
+ height=height,
514
+ **kwargs,
515
+ )
516
+ .images
517
+ )
518
+ latents = unscale_latents(latents)
519
+ if not output_type == "latent":
520
+ image = unscale_image(
521
+ self.vae.decode(
522
+ latents / self.vae.config.scaling_factor, return_dict=False
523
+ )[0]
524
+ )
525
+ else:
526
+ image = latents
527
+
528
+ image = self.image_processor.postprocess(image, output_type=output_type)
529
+ if not return_dict:
530
+ return (image,)
531
+
532
+ images = numpy.asarray(image[0], dtype=numpy.float32) / 255.0
533
+ images = torch.from_numpy(images).permute(2, 0, 1).contiguous().float()
534
+
535
+ n, m = 3, 2
536
+ c, h, w = images.shape
537
+ images = (
538
+ images.view(c, n, h // n, m, w // m).permute(1, 3, 0, 2, 4).contiguous()
539
+ )
540
+ images = images.view(n * m, c, h // n, w // m)
541
+
542
+ images = images.unsqueeze(0)
543
+ images = v2.functional.resize(
544
+ images, 320, interpolation=3, antialias=True
545
+ ).clamp(0, 1)
546
+
547
+ return Zero123PlusPipelineOutput(images=images)
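
Continuing the hedged loading sketch above, running the pipeline end to end might look like the following; the input path is a placeholder. __call__ removes the background with rembg, recenters and pads the object, then returns a Zero123PlusPipelineOutput whose images field is a (1, 6, 3, 320, 320) tensor holding the six generated views in [0, 1]:

from PIL import Image

# Placeholder input path. Background removal (rembg), cropping, and padding
# all happen inside __call__, so a plain RGB or RGBA photo is enough.
image = Image.open("input.png")

result = pipe(image, num_inference_steps=28)  # 28 is the default in __call__
views = result.images                         # torch.Tensor, shape (1, 6, 3, 320, 320)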
unet/config.json DELETED
@@ -1,73 +0,0 @@
1
- {
2
- "_class_name": "UNet2DConditionModel",
3
- "_diffusers_version": "0.30.3",
4
- "_name_or_path": "/home/dylan/.cache/huggingface/hub/models--sudo-ai--zero123plus-v1.2/snapshots/2da07e89919e1a130c9b5add1584c70c7aa065fd/unet",
5
- "act_fn": "silu",
6
- "addition_embed_type": null,
7
- "addition_embed_type_num_heads": 64,
8
- "addition_time_embed_dim": null,
9
- "attention_head_dim": [
10
- 5,
11
- 10,
12
- 20,
13
- 20
14
- ],
15
- "attention_type": "default",
16
- "block_out_channels": [
17
- 320,
18
- 640,
19
- 1280,
20
- 1280
21
- ],
22
- "center_input_sample": false,
23
- "class_embed_type": null,
24
- "class_embeddings_concat": false,
25
- "conv_in_kernel": 3,
26
- "conv_out_kernel": 3,
27
- "cross_attention_dim": 1024,
28
- "cross_attention_norm": null,
29
- "down_block_types": [
30
- "CrossAttnDownBlock2D",
31
- "CrossAttnDownBlock2D",
32
- "CrossAttnDownBlock2D",
33
- "DownBlock2D"
34
- ],
35
- "downsample_padding": 1,
36
- "dropout": 0.0,
37
- "dual_cross_attention": false,
38
- "encoder_hid_dim": null,
39
- "encoder_hid_dim_type": null,
40
- "flip_sin_to_cos": true,
41
- "freq_shift": 0,
42
- "in_channels": 4,
43
- "layers_per_block": 2,
44
- "mid_block_only_cross_attention": null,
45
- "mid_block_scale_factor": 1,
46
- "mid_block_type": "UNetMidBlock2DCrossAttn",
47
- "norm_eps": 1e-05,
48
- "norm_num_groups": 32,
49
- "num_attention_heads": null,
50
- "num_class_embeds": null,
51
- "only_cross_attention": false,
52
- "out_channels": 4,
53
- "projection_class_embeddings_input_dim": null,
54
- "resnet_out_scale_factor": 1.0,
55
- "resnet_skip_time_act": false,
56
- "resnet_time_scale_shift": "default",
57
- "reverse_transformer_layers_per_block": null,
58
- "sample_size": 96,
59
- "time_cond_proj_dim": null,
60
- "time_embedding_act_fn": null,
61
- "time_embedding_dim": null,
62
- "time_embedding_type": "positional",
63
- "timestep_post_act": null,
64
- "transformer_layers_per_block": 1,
65
- "up_block_types": [
66
- "UpBlock2D",
67
- "CrossAttnUpBlock2D",
68
- "CrossAttnUpBlock2D",
69
- "CrossAttnUpBlock2D"
70
- ],
71
- "upcast_attention": false,
72
- "use_linear_projection": true
73
- }
unet/diffusion_pytorch_model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c4cba18336cfeb369d18dca0b1af3b9268302d828d7eee871d22074d08b91b33
3
- size 1731904736