MaxMilan1 committed on
Commit 2c2acce
1 Parent(s): aa8461a

change to InstantMesh

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. README.md +2 -2
  2. app.py +348 -252
  3. ckpts/shoes.safetensors +0 -3
  4. ckpts/snckrsgen.safetensors +0 -3
  5. configs/ae/video.yaml +0 -35
  6. configs/embedder/clip_image.yaml +0 -8
  7. configs/example_training/autoencoder/kl-f4/imagenet-attnfree-logvar.yaml +0 -104
  8. configs/example_training/autoencoder/kl-f4/imagenet-kl_f8_8chn.yaml +0 -105
  9. configs/example_training/imagenet-f8_cond.yaml +0 -185
  10. configs/example_training/toy/cifar10_cond.yaml +0 -98
  11. configs/example_training/toy/mnist.yaml +0 -79
  12. configs/example_training/toy/mnist_cond.yaml +0 -98
  13. configs/example_training/toy/mnist_cond_discrete_eps.yaml +0 -103
  14. configs/example_training/toy/mnist_cond_l1_loss.yaml +0 -99
  15. configs/example_training/toy/mnist_cond_with_ema.yaml +0 -100
  16. configs/example_training/txt2img-clipl-legacy-ucg-training.yaml +0 -182
  17. configs/example_training/txt2img-clipl.yaml +0 -184
  18. configs/inference/sd_2_1.yaml +0 -60
  19. configs/inference/sd_2_1_768.yaml +0 -60
  20. configs/inference/sd_xl_base.yaml +0 -93
  21. configs/inference/sd_xl_refiner.yaml +0 -86
  22. configs/inference/svd.yaml +0 -131
  23. configs/inference/svd_image_decoder.yaml +0 -114
  24. configs/inference/svd_mv.yaml +0 -202
  25. configs/instant-mesh-base.yaml +22 -0
  26. configs/instant-mesh-large.yaml +22 -0
  27. configs/instant-nerf-base.yaml +21 -0
  28. configs/instant-nerf-large.yaml +21 -0
  29. examples/bird.jpg +0 -0
  30. examples/bubble_mart_blue.png +0 -0
  31. examples/cartoon_dinosaur.png +0 -0
  32. examples/cartoon_girl.jpg +0 -0
  33. examples/chair_armed.png +0 -0
  34. examples/chair_comfort.jpg +0 -0
  35. examples/chair_wood.jpg +0 -0
  36. examples/chest.jpg +0 -0
  37. examples/fruit_bycycle.jpg +0 -0
  38. examples/fruit_elephant.jpg +0 -0
  39. examples/genshin_building.png +0 -0
  40. examples/genshin_teapot.png +0 -0
  41. examples/hatsune_miku.png +0 -0
  42. examples/house2.jpg +0 -0
  43. examples/mushroom_teapot.jpg +0 -0
  44. examples/pikachu.png +0 -0
  45. examples/plant.jpg +0 -0
  46. examples/robot.jpg +0 -0
  47. examples/sea_turtle.png +0 -0
  48. examples/skating_shoe.jpg +0 -0
  49. examples/sorting_board.png +0 -0
  50. examples/sword.png +0 -0
README.md CHANGED
@@ -1,10 +1,10 @@
 ---
-title: ShoeGen + V3D
+title: InstantMesh
 emoji: 🏆
 colorFrom: red
 colorTo: gray
 sdk: gradio
-sdk_version: 4.21.0
+sdk_version: 4.26.0
 app_file: app.py
 pinned: false
 ---
app.py CHANGED
@@ -1,271 +1,367 @@
1
- # TODO
2
  import numpy as np
3
- import argparse
4
  import torch
5
- from torchvision.utils import make_grid
6
- import tempfile
7
- import gradio as gr
8
- from omegaconf import OmegaConf
9
- from einops import rearrange
10
- from scripts.pub.V3D_512 import (
11
- sample_one,
12
- get_batch,
13
- get_unique_embedder_keys_from_conditioner,
14
- load_model,
15
- )
16
- from sgm.util import default, instantiate_from_config
17
- from safetensors.torch import load_file as load_safetensors
18
  from PIL import Image
19
- from kiui.op import recenter
20
- from torchvision.transforms import ToTensor
 
21
  from einops import rearrange, repeat
22
- import rembg
23
- import os
24
- from glob import glob
25
- from mediapy import write_video
26
- from pathlib import Path
27
- import spaces
 
28
  from huggingface_hub import hf_hub_download
29
- import imageio
30
 
31
- import cv2
32
 
33
 
34
- @spaces.GPU
35
- def do_sample(
36
- image,
37
- num_frames,
38
- num_steps,
39
- decoding_t,
40
- border_ratio,
41
- ignore_alpha,
42
- output_folder,
43
- seed,
44
- ):
45
- # if image.mode == "RGBA":
46
- # image = image.convert("RGB")
47
- torch.manual_seed(seed)
48
- image = Image.fromarray(image)
49
- w, h = image.size
50
-
51
- if border_ratio > 0:
52
- if image.mode != "RGBA" or ignore_alpha:
53
- image = image.convert("RGB")
54
- image = np.asarray(image)
55
- carved_image = rembg.remove(image, session=rembg_session) # [H, W, 4]
56
- else:
57
- image = np.asarray(image)
58
- carved_image = image
59
- mask = carved_image[..., -1] > 0
60
- image = recenter(carved_image, mask, border_ratio=border_ratio)
61
- image = image.astype(np.float32) / 255.0
62
- if image.shape[-1] == 4:
63
- image = image[..., :3] * image[..., 3:4] + (1 - image[..., 3:4])
64
- image = Image.fromarray((image * 255).astype(np.uint8))
65
  else:
66
- print("Ignore border ratio")
67
- image = image.resize((512, 512))
68
-
69
- image = ToTensor()(image)
70
- image = image * 2.0 - 1.0
71
-
72
- image = image.unsqueeze(0).to(device)
73
- H, W = image.shape[2:]
74
- assert image.shape[1] == 3
75
- F = 8
76
- C = 4
77
- shape = (num_frames, C, H // F, W // F)
78
-
79
- value_dict = {}
80
- value_dict["motion_bucket_id"] = 0
81
- value_dict["fps_id"] = 0
82
- value_dict["cond_aug"] = 0.05
83
- value_dict["cond_frames_without_noise"] = clip_model(image)
84
- value_dict["cond_frames"] = ae_model.encode(image)
85
- value_dict["cond_frames"] += 0.05 * torch.randn_like(value_dict["cond_frames"])
86
- value_dict["cond_aug"] = 0.05
87
-
88
- print(device)
89
- with torch.no_grad():
90
- with torch.autocast(device_type="cuda"):
91
- batch, batch_uc = get_batch(
92
- get_unique_embedder_keys_from_conditioner(model.conditioner),
93
- value_dict,
94
- [1, num_frames],
95
- T=num_frames,
96
- device=device,
97
- )
98
- c, uc = model.conditioner.get_unconditional_conditioning(
99
- batch,
100
- batch_uc=batch_uc,
101
- force_uc_zero_embeddings=[
102
- "cond_frames",
103
- "cond_frames_without_noise",
104
- ],
105
- )
106
-
107
- for k in ["crossattn", "concat"]:
108
- uc[k] = repeat(uc[k], "b ... -> b t ...", t=num_frames)
109
- uc[k] = rearrange(uc[k], "b t ... -> (b t) ...", t=num_frames)
110
- c[k] = repeat(c[k], "b ... -> b t ...", t=num_frames)
111
- c[k] = rearrange(c[k], "b t ... -> (b t) ...", t=num_frames)
112
-
113
- randn = torch.randn(shape, device=device)
114
- randn = randn.to(device)
115
-
116
- additional_model_inputs = {}
117
- additional_model_inputs["image_only_indicator"] = torch.zeros(
118
- 2, num_frames
119
- ).to(device)
120
- additional_model_inputs["num_video_frames"] = batch["num_video_frames"]
121
-
122
- def denoiser(input, sigma, c):
123
- return model.denoiser(
124
- model.model, input, sigma, c, **additional_model_inputs
125
- )
126
 
127
- samples_z = model.sampler(denoiser, randn, cond=c, uc=uc)
128
- model.en_and_decode_n_samples_a_time = decoding_t
129
- samples_x = model.decode_first_stage(samples_z)
130
- samples = torch.clamp((samples_x + 1.0) / 2.0, min=0.0, max=1.0)
131
-
132
- os.makedirs(output_folder, exist_ok=True)
133
- base_count = len(glob(os.path.join(output_folder, "*.mp4")))
134
- video_path = os.path.join(output_folder, f"{base_count:06d}.mp4")
135
-
136
- frames = (
137
- (rearrange(samples, "t c h w -> t h w c") * 255)
138
- .cpu()
139
- .numpy()
140
- .astype(np.uint8)
141
- )
142
- # write_video(video_path, frames, fps=6)
143
- # writer = cv2.VideoWriter(
144
- # video_path,
145
- # cv2.VideoWriter_fourcc("m", "p", "4", "v"),
146
- # 6,
147
- # (frames.shape[-1], frames.shape[-2]),
148
- # )
149
- # for fr in frames:
150
- # writer.write(cv2.cvtColor(fr, cv2.COLOR_RGB2BGR))
151
- # writer.release()
152
- imageio.mimwrite(video_path, frames, fps=6)
153
-
154
- return video_path
155
-
156
-
157
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
158
-
159
- # download
160
- V3D_ckpt_path = hf_hub_download(repo_id="heheyas/V3D", filename="V3D.ckpt")
161
- svd_xt_ckpt_path = hf_hub_download(
162
- repo_id="stabilityai/stable-video-diffusion-img2vid-xt",
163
- filename="svd_xt.safetensors",
164
- )
165
 
166
- model_config = "./scripts/pub/configs/V3D_512.yaml"
167
- num_frames = OmegaConf.load(
168
- model_config
169
- ).model.params.sampler_config.params.guider_config.params.num_frames
170
- print("Detected num_frames:", num_frames)
171
- # num_steps = default(num_steps, 25)
172
- num_steps = 25
173
- output_folder = "outputs/V3D_512"
174
-
175
- sd = load_safetensors(svd_xt_ckpt_path)
176
- clip_model_config = OmegaConf.load("./configs/embedder/clip_image.yaml")
177
- clip_model = instantiate_from_config(clip_model_config).eval()
178
- clip_sd = dict()
179
- for k, v in sd.items():
180
- if "conditioner.embedders.0" in k:
181
- clip_sd[k.replace("conditioner.embedders.0.", "")] = v
182
- clip_model.load_state_dict(clip_sd)
183
- clip_model = clip_model.to(device)
184
-
185
- ae_model_config = OmegaConf.load("./configs/ae/video.yaml")
186
- ae_model = instantiate_from_config(ae_model_config).eval()
187
- encoder_sd = dict()
188
- for k, v in sd.items():
189
- if "first_stage_model" in k:
190
- encoder_sd[k.replace("first_stage_model.", "")] = v
191
- ae_model.load_state_dict(encoder_sd)
192
- ae_model = ae_model.to(device)
193
- rembg_session = rembg.new_session()
194
-
195
- model, _ = load_model(
196
- model_config,
197
- device,
198
- num_frames,
199
- num_steps,
200
- min_cfg=3.5,
201
- max_cfg=3.5,
202
- ckpt_path=V3D_ckpt_path,
203
  )
204
  model = model.to(device)
205
 
206
- with gr.Blocks(title="V3D", theme=gr.themes.Monochrome()) as demo:
207
- with gr.Row(equal_height=True):
208
  with gr.Column():
209
- input_image = gr.Image(value=None, label="Input Image")
210
-
211
- border_ratio_slider = gr.Slider(
212
- value=0.3,
213
- label="Border Ratio",
214
- minimum=0.05,
215
- maximum=0.5,
216
- step=0.05,
217
- )
218
- seed_input = gr.Number(value=42)
219
- decoding_t_slider = gr.Slider(
220
- value=1,
221
- label="Number of Decoding frames",
222
- minimum=1,
223
- maximum=num_frames,
224
- step=1,
225
- )
226
- min_guidance_slider = gr.Slider(
227
- value=3.5,
228
- label="Min CFG Value",
229
- minimum=0.05,
230
- maximum=5,
231
- step=0.05,
232
- )
233
- max_guidance_slider = gr.Slider(
234
- value=3.5,
235
- label="Max CFG Value",
236
- minimum=0.05,
237
- maximum=5,
238
- step=0.05,
239
- )
240
- run_button = gr.Button(value="Run V3D")
241
 
242
  with gr.Column():
243
- output_video = gr.Video(value=None, label="Output Orbit Video")
244
-
245
- @run_button.click(
246
- inputs=[
247
- input_image,
248
- border_ratio_slider,
249
- min_guidance_slider,
250
- max_guidance_slider,
251
- decoding_t_slider,
252
- seed_input,
253
- ],
254
- outputs=[output_video],
255
- )
256
- def _(image, border_ratio, min_guidance, max_guidance, decoding_t, seed):
257
- model.sampler.guider.max_scale = max_guidance
258
- model.sampler.guider.min_scale = min_guidance
259
- return do_sample(
260
- image,
261
- num_frames,
262
- num_steps,
263
- int(decoding_t),
264
- border_ratio,
265
- False,
266
- output_folder,
267
- seed,
268
- )
269
 
270
 
271
- demo.launch()
 
1
+ import spaces
2
+
3
+ import os
4
+ import imageio
5
  import numpy as np
 
6
  import torch
7
+ import rembg
8
  from PIL import Image
9
+ from torchvision.transforms import v2
10
+ from pytorch_lightning import seed_everything
11
+ from omegaconf import OmegaConf
12
  from einops import rearrange, repeat
13
+ from tqdm import tqdm
14
+ from diffusers import DiffusionPipeline, EulerAncestralDiscreteScheduler
15
+
16
+ from src.utils.train_util import instantiate_from_config
17
+ from src.utils.camera_util import (
18
+ FOV_to_intrinsics,
19
+ get_zero123plus_input_cameras,
20
+ get_circular_camera_poses,
21
+ )
22
+ from src.utils.mesh_util import save_obj
23
+ from src.utils.infer_util import remove_background, resize_foreground, images_to_video
24
+
25
+ import tempfile
26
+ from functools import partial
27
+
28
  from huggingface_hub import hf_hub_download
 
29
 
30
+ import gradio as gr
31
 
32
 
33
+ def get_render_cameras(batch_size=1, M=120, radius=2.5, elevation=10.0, is_flexicubes=False):
34
+ """
35
+ Get the rendering camera parameters.
36
+ """
37
+ c2ws = get_circular_camera_poses(M=M, radius=radius, elevation=elevation)
38
+ if is_flexicubes:
39
+ cameras = torch.linalg.inv(c2ws)
40
+ cameras = cameras.unsqueeze(0).repeat(batch_size, 1, 1, 1)
41
  else:
42
+ extrinsics = c2ws.flatten(-2)
43
+ intrinsics = FOV_to_intrinsics(50.0).unsqueeze(0).repeat(M, 1, 1).float().flatten(-2)
44
+ cameras = torch.cat([extrinsics, intrinsics], dim=-1)
45
+ cameras = cameras.unsqueeze(0).repeat(batch_size, 1, 1)
46
+ return cameras
47
48
 
49
+ def images_to_video(images, output_path, fps=30):
50
+ # images: (N, C, H, W)
51
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
52
+ frames = []
53
+ for i in range(images.shape[0]):
54
+ frame = (images[i].permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8).clip(0, 255)
55
+ assert frame.shape[0] == images.shape[2] and frame.shape[1] == images.shape[3], \
56
+ f"Frame shape mismatch: {frame.shape} vs {images.shape}"
57
+ assert frame.min() >= 0 and frame.max() <= 255, \
58
+ f"Frame value out of range: {frame.min()} ~ {frame.max()}"
59
+ frames.append(frame)
60
+ imageio.mimwrite(output_path, np.stack(frames), fps=fps, codec='h264')
61
+
62
+
63
+ ###############################################################################
64
+ # Configuration.
65
+ ###############################################################################
66
+
67
+ import shutil
68
+
69
+ def find_cuda():
70
+ # Check if CUDA_HOME or CUDA_PATH environment variables are set
71
+ cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH')
72
+
73
+ if cuda_home and os.path.exists(cuda_home):
74
+ return cuda_home
75
+
76
+ # Search for the nvcc executable in the system's PATH
77
+ nvcc_path = shutil.which('nvcc')
78
+
79
+ if nvcc_path:
80
+ # Remove the 'bin/nvcc' part to get the CUDA installation path
81
+ cuda_path = os.path.dirname(os.path.dirname(nvcc_path))
82
+ return cuda_path
83
+
84
+ return None
85
+
86
+ cuda_path = find_cuda()
87
+
88
+ if cuda_path:
89
+ print(f"CUDA installation found at: {cuda_path}")
90
+ else:
91
+ print("CUDA installation not found")
92
+
93
+ config_path = 'configs/instant-mesh-large.yaml'
94
+ config = OmegaConf.load(config_path)
95
+ config_name = os.path.basename(config_path).replace('.yaml', '')
96
+ model_config = config.model_config
97
+ infer_config = config.infer_config
98
+
99
+ IS_FLEXICUBES = True if config_name.startswith('instant-mesh') else False
100
+
101
+ device = torch.device('cuda')
102
+
103
+ # load diffusion model
104
+ print('Loading diffusion model ...')
105
+ pipeline = DiffusionPipeline.from_pretrained(
106
+ "sudo-ai/zero123plus-v1.2",
107
+ custom_pipeline="zero123plus",
108
+ torch_dtype=torch.float16,
109
+ )
110
+ pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(
111
+ pipeline.scheduler.config, timestep_spacing='trailing'
112
  )
113
+
114
+ # load custom white-background UNet
115
+ unet_ckpt_path = hf_hub_download(repo_id="TencentARC/InstantMesh", filename="diffusion_pytorch_model.bin", repo_type="model")
116
+ state_dict = torch.load(unet_ckpt_path, map_location='cpu')
117
+ pipeline.unet.load_state_dict(state_dict, strict=True)
118
+
119
+ pipeline = pipeline.to(device)
120
+
121
+ # load reconstruction model
122
+ print('Loading reconstruction model ...')
123
+ model_ckpt_path = hf_hub_download(repo_id="TencentARC/InstantMesh", filename="instant_mesh_large.ckpt", repo_type="model")
124
+ model = instantiate_from_config(model_config)
125
+ state_dict = torch.load(model_ckpt_path, map_location='cpu')['state_dict']
126
+ state_dict = {k[14:]: v for k, v in state_dict.items() if k.startswith('lrm_generator.') and 'source_camera' not in k}
127
+ model.load_state_dict(state_dict, strict=True)
128
+
129
  model = model.to(device)
130
 
131
+ print('Loading Finished!')
132
+
133
+
134
+ def check_input_image(input_image):
135
+ if input_image is None:
136
+ raise gr.Error("No image uploaded!")
137
+
138
+
139
+ def preprocess(input_image, do_remove_background):
140
+
141
+ rembg_session = rembg.new_session() if do_remove_background else None
142
+
143
+ if do_remove_background:
144
+ input_image = remove_background(input_image, rembg_session)
145
+ input_image = resize_foreground(input_image, 0.85)
146
+
147
+ return input_image
148
+
149
+
150
+ @spaces.GPU
151
+ def generate_mvs(input_image, sample_steps, sample_seed):
152
+
153
+ seed_everything(sample_seed)
154
+
155
+ # sampling
156
+ z123_image = pipeline(
157
+ input_image,
158
+ num_inference_steps=sample_steps
159
+ ).images[0]
160
+
161
+ show_image = np.asarray(z123_image, dtype=np.uint8)
162
+ show_image = torch.from_numpy(show_image) # (960, 640, 3)
163
+ show_image = rearrange(show_image, '(n h) (m w) c -> (n m) h w c', n=3, m=2)
164
+ show_image = rearrange(show_image, '(n m) h w c -> (n h) (m w) c', n=2, m=3)
165
+ show_image = Image.fromarray(show_image.numpy())
166
+
167
+ return z123_image, show_image
168
+
169
+
170
+ @spaces.GPU
171
+ def make3d(images):
172
+
173
+ global model
174
+ if IS_FLEXICUBES:
175
+ model.init_flexicubes_geometry(device, use_renderer=False)
176
+ model = model.eval()
177
+
178
+ images = np.asarray(images, dtype=np.float32) / 255.0
179
+ images = torch.from_numpy(images).permute(2, 0, 1).contiguous().float() # (3, 960, 640)
180
+ images = rearrange(images, 'c (n h) (m w) -> (n m) c h w', n=3, m=2) # (6, 3, 320, 320)
181
+
182
+ input_cameras = get_zero123plus_input_cameras(batch_size=1, radius=4.0).to(device)
183
+ render_cameras = get_render_cameras(batch_size=1, radius=2.5, is_flexicubes=IS_FLEXICUBES).to(device)
184
+
185
+ images = images.unsqueeze(0).to(device)
186
+ images = v2.functional.resize(images, (320, 320), interpolation=3, antialias=True).clamp(0, 1)
187
+
188
+ mesh_fpath = tempfile.NamedTemporaryFile(suffix=f".obj", delete=False).name
189
+ print(mesh_fpath)
190
+ mesh_basename = os.path.basename(mesh_fpath).split('.')[0]
191
+ mesh_dirname = os.path.dirname(mesh_fpath)
192
+ video_fpath = os.path.join(mesh_dirname, f"{mesh_basename}.mp4")
193
+
194
+ with torch.no_grad():
195
+ # get triplane
196
+ planes = model.forward_planes(images, input_cameras)
197
+
198
+ # # get video
199
+ # chunk_size = 20 if IS_FLEXICUBES else 1
200
+ # render_size = 384
201
+
202
+ # frames = []
203
+ # for i in tqdm(range(0, render_cameras.shape[1], chunk_size)):
204
+ # if IS_FLEXICUBES:
205
+ # frame = model.forward_geometry(
206
+ # planes,
207
+ # render_cameras[:, i:i+chunk_size],
208
+ # render_size=render_size,
209
+ # )['img']
210
+ # else:
211
+ # frame = model.synthesizer(
212
+ # planes,
213
+ # cameras=render_cameras[:, i:i+chunk_size],
214
+ # render_size=render_size,
215
+ # )['images_rgb']
216
+ # frames.append(frame)
217
+ # frames = torch.cat(frames, dim=1)
218
+
219
+ # images_to_video(
220
+ # frames[0],
221
+ # video_fpath,
222
+ # fps=30,
223
+ # )
224
+
225
+ # print(f"Video saved to {video_fpath}")
226
+
227
+ # get mesh
228
+ mesh_out = model.extract_mesh(
229
+ planes,
230
+ use_texture_map=False,
231
+ **infer_config,
232
+ )
233
+
234
+ vertices, faces, vertex_colors = mesh_out
235
+ vertices = vertices[:, [1, 2, 0]]
236
+ vertices[:, -1] *= -1
237
+ faces = faces[:, [2, 1, 0]]
238
+
239
+ save_obj(vertices, faces, vertex_colors, mesh_fpath)
240
+
241
+ print(f"Mesh saved to {mesh_fpath}")
242
+
243
+ return mesh_fpath
244
+
245
+
246
+ _HEADER_ = '''
247
+ <h2><b>Official 🤗 Gradio Demo</b></h2><h2><a href='https://github.com/TencentARC/InstantMesh' target='_blank'><b>InstantMesh: Efficient 3D Mesh Generation from a Single Image with Sparse-view Large Reconstruction Models</b></a></h2>
248
+ '''
249
+
250
+ _LINKS_ = '''
251
+ <h3>Code is available at <a href='https://github.com/TencentARC/InstantMesh' target='_blank'>GitHub</a></h3>
252
+ <h3>Report is available at <a href='https://arxiv.org/abs/2404.07191' target='_blank'>ArXiv</a></h3>
253
+ '''
254
+
255
+ _CITE_ = r"""
256
+ ```bibtex
257
+ @article{xu2024instantmesh,
258
+ title={InstantMesh: Efficient 3D Mesh Generation from a Single Image with Sparse-view Large Reconstruction Models},
259
+ author={Xu, Jiale and Cheng, Weihao and Gao, Yiming and Wang, Xintao and Gao, Shenghua and Shan, Ying},
260
+ journal={arXiv preprint arXiv:2404.07191},
261
+ year={2024}
262
+ }
263
+ ```
264
+ """
265
+
266
+
267
+ with gr.Blocks() as demo:
268
+ gr.Markdown(_HEADER_)
269
+ with gr.Row(variant="panel"):
270
  with gr.Column():
271
+ with gr.Row():
272
+ input_image = gr.Image(
273
+ label="Input Image",
274
+ image_mode="RGBA",
275
+ sources="upload",
276
+ #width=256,
277
+ #height=256,
278
+ type="pil",
279
+ elem_id="content_image",
280
+ )
281
+ processed_image = gr.Image(
282
+ label="Processed Image",
283
+ image_mode="RGBA",
284
+ #width=256,
285
+ #height=256,
286
+ type="pil",
287
+ interactive=False
288
+ )
289
+ with gr.Row():
290
+ with gr.Group():
291
+ do_remove_background = gr.Checkbox(
292
+ label="Remove Background", value=True
293
+ )
294
+ sample_seed = gr.Number(value=42, label="Seed Value", precision=0)
295
+
296
+ sample_steps = gr.Slider(
297
+ label="Sample Steps",
298
+ minimum=30,
299
+ maximum=75,
300
+ value=75,
301
+ step=5
302
+ )
303
+
304
+ with gr.Row():
305
+ submit = gr.Button("Generate", elem_id="generate", variant="primary")
306
+
307
+ with gr.Row(variant="panel"):
308
+ gr.Examples(
309
+ examples=[
310
+ os.path.join("examples", img_name) for img_name in sorted(os.listdir("examples"))
311
+ ],
312
+ inputs=[input_image],
313
+ label="Examples",
314
+ cache_examples=False,
315
+ examples_per_page=12
316
+ )
317
 
318
  with gr.Column():
319
 
320
+ with gr.Row():
321
+
322
+ with gr.Column():
323
+ mv_show_images = gr.Image(
324
+ label="Generated Multi-views",
325
+ type="pil",
326
+ width=379,
327
+ interactive=False
328
+ )
329
+
330
+ # with gr.Column():
331
+ # output_video = gr.Video(
332
+ # label="video", format="mp4",
333
+ # width=379,
334
+ # autoplay=True,
335
+ # interactive=False
336
+ # )
337
+
338
+ with gr.Row():
339
+ output_model_obj = gr.Model3D(
340
+ label="Output Model (OBJ Format)",
341
+ interactive=False,
342
+ )
343
+
344
+ with gr.Row():
345
+ gr.Markdown('''Try a different <b>seed value</b> if the result is unsatisfying (Default: 42).''')
346
+
347
+ gr.Markdown(_LINKS_)
348
+ gr.Markdown(_CITE_)
349
+
350
+ mv_images = gr.State()
351
+
352
+ submit.click(fn=check_input_image, inputs=[input_image]).success(
353
+ fn=preprocess,
354
+ inputs=[input_image, do_remove_background],
355
+ outputs=[processed_image],
356
+ ).success(
357
+ fn=generate_mvs,
358
+ inputs=[processed_image, sample_steps, sample_seed],
359
+ outputs=[mv_images, mv_show_images]
360
+
361
+ ).success(
362
+ fn=make3d,
363
+ inputs=[mv_images],
364
+ outputs=[output_model_obj]
365
+ )
366
 
367
+ demo.launch()
ckpts/shoes.safetensors DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e66a57b2174aff462c3bc0c9f9e3b1142617d856a1f5ddbada3b696dcc057b73
-size 170543188
ckpts/snckrsgen.safetensors DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e80bf5f4ded84793d74c9939b0fc1a09b76af31bafe2ac3190c21c9be5eb6965
-size 151112168
configs/ae/video.yaml DELETED
@@ -1,35 +0,0 @@
-target: sgm.models.autoencoder.AutoencodingEngine
-params:
-  loss_config:
-    target: torch.nn.Identity
-  regularizer_config:
-    target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
-  encoder_config:
-    target: sgm.modules.diffusionmodules.model.Encoder
-    params:
-      attn_type: vanilla
-      double_z: True
-      z_channels: 4
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult: [1, 2, 4, 4]
-      num_res_blocks: 2
-      attn_resolutions: []
-      dropout: 0.0
-  decoder_config:
-    target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
-    params:
-      attn_type: vanilla
-      double_z: True
-      z_channels: 4
-      resolution: 256
-      in_channels: 3
-      out_ch: 3
-      ch: 128
-      ch_mult: [1, 2, 4, 4]
-      num_res_blocks: 2
-      attn_resolutions: []
-      dropout: 0.0
-      video_kernel_size: [3, 1, 1]
configs/embedder/clip_image.yaml DELETED
@@ -1,8 +0,0 @@
-target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
-params:
-  n_cond_frames: 1
-  n_copies: 1
-  open_clip_embedding_config:
-    target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
-    params:
-      freeze: True
configs/example_training/autoencoder/kl-f4/imagenet-attnfree-logvar.yaml DELETED
@@ -1,104 +0,0 @@
1
- model:
2
- base_learning_rate: 4.5e-6
3
- target: sgm.models.autoencoder.AutoencodingEngine
4
- params:
5
- input_key: jpg
6
- monitor: val/rec_loss
7
-
8
- loss_config:
9
- target: sgm.modules.autoencoding.losses.GeneralLPIPSWithDiscriminator
10
- params:
11
- perceptual_weight: 0.25
12
- disc_start: 20001
13
- disc_weight: 0.5
14
- learn_logvar: True
15
-
16
- regularization_weights:
17
- kl_loss: 1.0
18
-
19
- regularizer_config:
20
- target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
21
-
22
- encoder_config:
23
- target: sgm.modules.diffusionmodules.model.Encoder
24
- params:
25
- attn_type: none
26
- double_z: True
27
- z_channels: 4
28
- resolution: 256
29
- in_channels: 3
30
- out_ch: 3
31
- ch: 128
32
- ch_mult: [1, 2, 4]
33
- num_res_blocks: 4
34
- attn_resolutions: []
35
- dropout: 0.0
36
-
37
- decoder_config:
38
- target: sgm.modules.diffusionmodules.model.Decoder
39
- params: ${model.params.encoder_config.params}
40
-
41
- data:
42
- target: sgm.data.dataset.StableDataModuleFromConfig
43
- params:
44
- train:
45
- datapipeline:
46
- urls:
47
- - DATA-PATH
48
- pipeline_config:
49
- shardshuffle: 10000
50
- sample_shuffle: 10000
51
-
52
- decoders:
53
- - pil
54
-
55
- postprocessors:
56
- - target: sdata.mappers.TorchVisionImageTransforms
57
- params:
58
- key: jpg
59
- transforms:
60
- - target: torchvision.transforms.Resize
61
- params:
62
- size: 256
63
- interpolation: 3
64
- - target: torchvision.transforms.ToTensor
65
- - target: sdata.mappers.Rescaler
66
- - target: sdata.mappers.AddOriginalImageSizeAsTupleAndCropToSquare
67
- params:
68
- h_key: height
69
- w_key: width
70
-
71
- loader:
72
- batch_size: 8
73
- num_workers: 4
74
-
75
-
76
- lightning:
77
- strategy:
78
- target: pytorch_lightning.strategies.DDPStrategy
79
- params:
80
- find_unused_parameters: True
81
-
82
- modelcheckpoint:
83
- params:
84
- every_n_train_steps: 5000
85
-
86
- callbacks:
87
- metrics_over_trainsteps_checkpoint:
88
- params:
89
- every_n_train_steps: 50000
90
-
91
- image_logger:
92
- target: main.ImageLogger
93
- params:
94
- enable_autocast: False
95
- batch_frequency: 1000
96
- max_images: 8
97
- increase_log_steps: True
98
-
99
- trainer:
100
- devices: 0,
101
- limit_val_batches: 50
102
- benchmark: True
103
- accumulate_grad_batches: 1
104
- val_check_interval: 10000
configs/example_training/autoencoder/kl-f4/imagenet-kl_f8_8chn.yaml DELETED
@@ -1,105 +0,0 @@
1
- model:
2
- base_learning_rate: 4.5e-6
3
- target: sgm.models.autoencoder.AutoencodingEngine
4
- params:
5
- input_key: jpg
6
- monitor: val/loss/rec
7
- disc_start_iter: 0
8
-
9
- encoder_config:
10
- target: sgm.modules.diffusionmodules.model.Encoder
11
- params:
12
- attn_type: vanilla-xformers
13
- double_z: true
14
- z_channels: 8
15
- resolution: 256
16
- in_channels: 3
17
- out_ch: 3
18
- ch: 128
19
- ch_mult: [1, 2, 4, 4]
20
- num_res_blocks: 2
21
- attn_resolutions: []
22
- dropout: 0.0
23
-
24
- decoder_config:
25
- target: sgm.modules.diffusionmodules.model.Decoder
26
- params: ${model.params.encoder_config.params}
27
-
28
- regularizer_config:
29
- target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
30
-
31
- loss_config:
32
- target: sgm.modules.autoencoding.losses.GeneralLPIPSWithDiscriminator
33
- params:
34
- perceptual_weight: 0.25
35
- disc_start: 20001
36
- disc_weight: 0.5
37
- learn_logvar: True
38
-
39
- regularization_weights:
40
- kl_loss: 1.0
41
-
42
- data:
43
- target: sgm.data.dataset.StableDataModuleFromConfig
44
- params:
45
- train:
46
- datapipeline:
47
- urls:
48
- - DATA-PATH
49
- pipeline_config:
50
- shardshuffle: 10000
51
- sample_shuffle: 10000
52
-
53
- decoders:
54
- - pil
55
-
56
- postprocessors:
57
- - target: sdata.mappers.TorchVisionImageTransforms
58
- params:
59
- key: jpg
60
- transforms:
61
- - target: torchvision.transforms.Resize
62
- params:
63
- size: 256
64
- interpolation: 3
65
- - target: torchvision.transforms.ToTensor
66
- - target: sdata.mappers.Rescaler
67
- - target: sdata.mappers.AddOriginalImageSizeAsTupleAndCropToSquare
68
- params:
69
- h_key: height
70
- w_key: width
71
-
72
- loader:
73
- batch_size: 8
74
- num_workers: 4
75
-
76
-
77
- lightning:
78
- strategy:
79
- target: pytorch_lightning.strategies.DDPStrategy
80
- params:
81
- find_unused_parameters: True
82
-
83
- modelcheckpoint:
84
- params:
85
- every_n_train_steps: 5000
86
-
87
- callbacks:
88
- metrics_over_trainsteps_checkpoint:
89
- params:
90
- every_n_train_steps: 50000
91
-
92
- image_logger:
93
- target: main.ImageLogger
94
- params:
95
- enable_autocast: False
96
- batch_frequency: 1000
97
- max_images: 8
98
- increase_log_steps: True
99
-
100
- trainer:
101
- devices: 0,
102
- limit_val_batches: 50
103
- benchmark: True
104
- accumulate_grad_batches: 1
105
- val_check_interval: 10000
configs/example_training/imagenet-f8_cond.yaml DELETED
@@ -1,185 +0,0 @@
1
- model:
2
- base_learning_rate: 1.0e-4
3
- target: sgm.models.diffusion.DiffusionEngine
4
- params:
5
- scale_factor: 0.13025
6
- disable_first_stage_autocast: True
7
- log_keys:
8
- - cls
9
-
10
- scheduler_config:
11
- target: sgm.lr_scheduler.LambdaLinearScheduler
12
- params:
13
- warm_up_steps: [10000]
14
- cycle_lengths: [10000000000000]
15
- f_start: [1.e-6]
16
- f_max: [1.]
17
- f_min: [1.]
18
-
19
- denoiser_config:
20
- target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
21
- params:
22
- num_idx: 1000
23
-
24
- scaling_config:
25
- target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
26
- discretization_config:
27
- target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
28
-
29
- network_config:
30
- target: sgm.modules.diffusionmodules.openaimodel.UNetModel
31
- params:
32
- use_checkpoint: True
33
- in_channels: 4
34
- out_channels: 4
35
- model_channels: 256
36
- attention_resolutions: [1, 2, 4]
37
- num_res_blocks: 2
38
- channel_mult: [1, 2, 4]
39
- num_head_channels: 64
40
- num_classes: sequential
41
- adm_in_channels: 1024
42
- transformer_depth: 1
43
- context_dim: 1024
44
- spatial_transformer_attn_type: softmax-xformers
45
-
46
- conditioner_config:
47
- target: sgm.modules.GeneralConditioner
48
- params:
49
- emb_models:
50
- - is_trainable: True
51
- input_key: cls
52
- ucg_rate: 0.2
53
- target: sgm.modules.encoders.modules.ClassEmbedder
54
- params:
55
- add_sequence_dim: True
56
- embed_dim: 1024
57
- n_classes: 1000
58
-
59
- - is_trainable: False
60
- ucg_rate: 0.2
61
- input_key: original_size_as_tuple
62
- target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
63
- params:
64
- outdim: 256
65
-
66
- - is_trainable: False
67
- input_key: crop_coords_top_left
68
- ucg_rate: 0.2
69
- target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
70
- params:
71
- outdim: 256
72
-
73
- first_stage_config:
74
- target: sgm.models.autoencoder.AutoencoderKL
75
- params:
76
- ckpt_path: CKPT_PATH
77
- embed_dim: 4
78
- monitor: val/rec_loss
79
- ddconfig:
80
- attn_type: vanilla-xformers
81
- double_z: true
82
- z_channels: 4
83
- resolution: 256
84
- in_channels: 3
85
- out_ch: 3
86
- ch: 128
87
- ch_mult: [1, 2, 4, 4]
88
- num_res_blocks: 2
89
- attn_resolutions: []
90
- dropout: 0.0
91
- lossconfig:
92
- target: torch.nn.Identity
93
-
94
- loss_fn_config:
95
- target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
96
- params:
97
- loss_weighting_config:
98
- target: sgm.modules.diffusionmodules.loss_weighting.EpsWeighting
99
- sigma_sampler_config:
100
- target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
101
- params:
102
- num_idx: 1000
103
-
104
- discretization_config:
105
- target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
106
-
107
- sampler_config:
108
- target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
109
- params:
110
- num_steps: 50
111
-
112
- discretization_config:
113
- target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
114
-
115
- guider_config:
116
- target: sgm.modules.diffusionmodules.guiders.VanillaCFG
117
- params:
118
- scale: 5.0
119
-
120
- data:
121
- target: sgm.data.dataset.StableDataModuleFromConfig
122
- params:
123
- train:
124
- datapipeline:
125
- urls:
126
- # USER: adapt this path the root of your custom dataset
127
- - DATA_PATH
128
- pipeline_config:
129
- shardshuffle: 10000
130
- sample_shuffle: 10000 # USER: you might wanna adapt depending on your available RAM
131
-
132
- decoders:
133
- - pil
134
-
135
- postprocessors:
136
- - target: sdata.mappers.TorchVisionImageTransforms
137
- params:
138
- key: jpg # USER: you might wanna adapt this for your custom dataset
139
- transforms:
140
- - target: torchvision.transforms.Resize
141
- params:
142
- size: 256
143
- interpolation: 3
144
- - target: torchvision.transforms.ToTensor
145
- - target: sdata.mappers.Rescaler
146
-
147
- - target: sdata.mappers.AddOriginalImageSizeAsTupleAndCropToSquare
148
- params:
149
- h_key: height # USER: you might wanna adapt this for your custom dataset
150
- w_key: width # USER: you might wanna adapt this for your custom dataset
151
-
152
- loader:
153
- batch_size: 64
154
- num_workers: 6
155
-
156
- lightning:
157
- modelcheckpoint:
158
- params:
159
- every_n_train_steps: 5000
160
-
161
- callbacks:
162
- metrics_over_trainsteps_checkpoint:
163
- params:
164
- every_n_train_steps: 25000
165
-
166
- image_logger:
167
- target: main.ImageLogger
168
- params:
169
- disabled: False
170
- enable_autocast: False
171
- batch_frequency: 1000
172
- max_images: 8
173
- increase_log_steps: True
174
- log_first_step: False
175
- log_images_kwargs:
176
- use_ema_scope: False
177
- N: 8
178
- n_rows: 2
179
-
180
- trainer:
181
- devices: 0,
182
- benchmark: True
183
- num_sanity_val_steps: 0
184
- accumulate_grad_batches: 1
185
- max_epochs: 1000
configs/example_training/toy/cifar10_cond.yaml DELETED
@@ -1,98 +0,0 @@
1
- model:
2
- base_learning_rate: 1.0e-4
3
- target: sgm.models.diffusion.DiffusionEngine
4
- params:
5
- denoiser_config:
6
- target: sgm.modules.diffusionmodules.denoiser.Denoiser
7
- params:
8
- scaling_config:
9
- target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling
10
- params:
11
- sigma_data: 1.0
12
-
13
- network_config:
14
- target: sgm.modules.diffusionmodules.openaimodel.UNetModel
15
- params:
16
- in_channels: 3
17
- out_channels: 3
18
- model_channels: 32
19
- attention_resolutions: []
20
- num_res_blocks: 4
21
- channel_mult: [1, 2, 2]
22
- num_head_channels: 32
23
- num_classes: sequential
24
- adm_in_channels: 128
25
-
26
- conditioner_config:
27
- target: sgm.modules.GeneralConditioner
28
- params:
29
- emb_models:
30
- - is_trainable: True
31
- input_key: cls
32
- ucg_rate: 0.2
33
- target: sgm.modules.encoders.modules.ClassEmbedder
34
- params:
35
- embed_dim: 128
36
- n_classes: 10
37
-
38
- first_stage_config:
39
- target: sgm.models.autoencoder.IdentityFirstStage
40
-
41
- loss_fn_config:
42
- target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
43
- params:
44
- loss_weighting_config:
45
- target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
46
- params:
47
- sigma_data: 1.0
48
- sigma_sampler_config:
49
- target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
50
-
51
- sampler_config:
52
- target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
53
- params:
54
- num_steps: 50
55
-
56
- discretization_config:
57
- target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
58
-
59
- guider_config:
60
- target: sgm.modules.diffusionmodules.guiders.VanillaCFG
61
- params:
62
- scale: 3.0
63
-
64
- data:
65
- target: sgm.data.cifar10.CIFAR10Loader
66
- params:
67
- batch_size: 512
68
- num_workers: 1
69
-
70
- lightning:
71
- modelcheckpoint:
72
- params:
73
- every_n_train_steps: 5000
74
-
75
- callbacks:
76
- metrics_over_trainsteps_checkpoint:
77
- params:
78
- every_n_train_steps: 25000
79
-
80
- image_logger:
81
- target: main.ImageLogger
82
- params:
83
- disabled: False
84
- batch_frequency: 1000
85
- max_images: 64
86
- increase_log_steps: True
87
- log_first_step: False
88
- log_images_kwargs:
89
- use_ema_scope: False
90
- N: 64
91
- n_rows: 8
92
-
93
- trainer:
94
- devices: 0,
95
- benchmark: True
96
- num_sanity_val_steps: 0
97
- accumulate_grad_batches: 1
98
- max_epochs: 20
configs/example_training/toy/mnist.yaml DELETED
@@ -1,79 +0,0 @@
1
- model:
2
- base_learning_rate: 1.0e-4
3
- target: sgm.models.diffusion.DiffusionEngine
4
- params:
5
- denoiser_config:
6
- target: sgm.modules.diffusionmodules.denoiser.Denoiser
7
- params:
8
- scaling_config:
9
- target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling
10
- params:
11
- sigma_data: 1.0
12
-
13
- network_config:
14
- target: sgm.modules.diffusionmodules.openaimodel.UNetModel
15
- params:
16
- in_channels: 1
17
- out_channels: 1
18
- model_channels: 32
19
- attention_resolutions: []
20
- num_res_blocks: 4
21
- channel_mult: [1, 2, 2]
22
- num_head_channels: 32
23
-
24
- first_stage_config:
25
- target: sgm.models.autoencoder.IdentityFirstStage
26
-
27
- loss_fn_config:
28
- target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
29
- params:
30
- loss_weighting_config:
31
- target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
32
- params:
33
- sigma_data: 1.0
34
- sigma_sampler_config:
35
- target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
36
-
37
- sampler_config:
38
- target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
39
- params:
40
- num_steps: 50
41
-
42
- discretization_config:
43
- target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
44
-
45
- data:
46
- target: sgm.data.mnist.MNISTLoader
47
- params:
48
- batch_size: 512
49
- num_workers: 1
50
-
51
- lightning:
52
- modelcheckpoint:
53
- params:
54
- every_n_train_steps: 5000
55
-
56
- callbacks:
57
- metrics_over_trainsteps_checkpoint:
58
- params:
59
- every_n_train_steps: 25000
60
-
61
- image_logger:
62
- target: main.ImageLogger
63
- params:
64
- disabled: False
65
- batch_frequency: 1000
66
- max_images: 64
67
- increase_log_steps: False
68
- log_first_step: False
69
- log_images_kwargs:
70
- use_ema_scope: False
71
- N: 64
72
- n_rows: 8
73
-
74
- trainer:
75
- devices: 0,
76
- benchmark: True
77
- num_sanity_val_steps: 0
78
- accumulate_grad_batches: 1
79
- max_epochs: 10
configs/example_training/toy/mnist_cond.yaml DELETED
@@ -1,98 +0,0 @@
1
- model:
2
- base_learning_rate: 1.0e-4
3
- target: sgm.models.diffusion.DiffusionEngine
4
- params:
5
- denoiser_config:
6
- target: sgm.modules.diffusionmodules.denoiser.Denoiser
7
- params:
8
- scaling_config:
9
- target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling
10
- params:
11
- sigma_data: 1.0
12
-
13
- network_config:
14
- target: sgm.modules.diffusionmodules.openaimodel.UNetModel
15
- params:
16
- in_channels: 1
17
- out_channels: 1
18
- model_channels: 32
19
- attention_resolutions: []
20
- num_res_blocks: 4
21
- channel_mult: [1, 2, 2]
22
- num_head_channels: 32
23
- num_classes: sequential
24
- adm_in_channels: 128
25
-
26
- conditioner_config:
27
- target: sgm.modules.GeneralConditioner
28
- params:
29
- emb_models:
30
- - is_trainable: True
31
- input_key: cls
32
- ucg_rate: 0.2
33
- target: sgm.modules.encoders.modules.ClassEmbedder
34
- params:
35
- embed_dim: 128
36
- n_classes: 10
37
-
38
- first_stage_config:
39
- target: sgm.models.autoencoder.IdentityFirstStage
40
-
41
- loss_fn_config:
42
- target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
43
- params:
44
- loss_weighting_config:
45
- target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
46
- params:
47
- sigma_data: 1.0
48
- sigma_sampler_config:
49
- target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
50
-
51
- sampler_config:
52
- target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
53
- params:
54
- num_steps: 50
55
-
56
- discretization_config:
57
- target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
58
-
59
- guider_config:
60
- target: sgm.modules.diffusionmodules.guiders.VanillaCFG
61
- params:
62
- scale: 3.0
63
-
64
- data:
65
- target: sgm.data.mnist.MNISTLoader
66
- params:
67
- batch_size: 512
68
- num_workers: 1
69
-
70
- lightning:
71
- modelcheckpoint:
72
- params:
73
- every_n_train_steps: 5000
74
-
75
- callbacks:
76
- metrics_over_trainsteps_checkpoint:
77
- params:
78
- every_n_train_steps: 25000
79
-
80
- image_logger:
81
- target: main.ImageLogger
82
- params:
83
- disabled: False
84
- batch_frequency: 1000
85
- max_images: 16
86
- increase_log_steps: True
87
- log_first_step: False
88
- log_images_kwargs:
89
- use_ema_scope: False
90
- N: 16
91
- n_rows: 4
92
-
93
- trainer:
94
- devices: 0,
95
- benchmark: True
96
- num_sanity_val_steps: 0
97
- accumulate_grad_batches: 1
98
- max_epochs: 20
configs/example_training/toy/mnist_cond_discrete_eps.yaml DELETED
@@ -1,103 +0,0 @@
1
- model:
2
- base_learning_rate: 1.0e-4
3
- target: sgm.models.diffusion.DiffusionEngine
4
- params:
5
- denoiser_config:
6
- target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
7
- params:
8
- num_idx: 1000
9
-
10
- scaling_config:
11
- target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling
12
- discretization_config:
13
- target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
14
-
15
- network_config:
16
- target: sgm.modules.diffusionmodules.openaimodel.UNetModel
17
- params:
18
- in_channels: 1
19
- out_channels: 1
20
- model_channels: 32
21
- attention_resolutions: []
22
- num_res_blocks: 4
23
- channel_mult: [1, 2, 2]
24
- num_head_channels: 32
25
- num_classes: sequential
26
- adm_in_channels: 128
27
-
28
- conditioner_config:
29
- target: sgm.modules.GeneralConditioner
30
- params:
31
- emb_models:
32
- - is_trainable: True
33
- input_key: cls
34
- ucg_rate: 0.2
35
- target: sgm.modules.encoders.modules.ClassEmbedder
36
- params:
37
- embed_dim: 128
38
- n_classes: 10
39
-
40
- first_stage_config:
41
- target: sgm.models.autoencoder.IdentityFirstStage
42
-
43
- loss_fn_config:
44
- target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
45
- params:
46
- loss_weighting_config:
47
- target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
48
- sigma_sampler_config:
49
- target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
50
- params:
51
- num_idx: 1000
52
-
53
- discretization_config:
54
- target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
55
-
56
- sampler_config:
57
- target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
58
- params:
59
- num_steps: 50
60
-
61
- discretization_config:
62
- target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
63
-
64
- guider_config:
65
- target: sgm.modules.diffusionmodules.guiders.VanillaCFG
66
- params:
67
- scale: 5.0
68
-
69
- data:
70
- target: sgm.data.mnist.MNISTLoader
71
- params:
72
- batch_size: 512
73
- num_workers: 1
74
-
75
- lightning:
76
- modelcheckpoint:
77
- params:
78
- every_n_train_steps: 5000
79
-
80
- callbacks:
81
- metrics_over_trainsteps_checkpoint:
82
- params:
83
- every_n_train_steps: 25000
84
-
85
- image_logger:
86
- target: main.ImageLogger
87
- params:
88
- disabled: False
89
- batch_frequency: 1000
90
- max_images: 16
91
- increase_log_steps: True
92
- log_first_step: False
93
- log_images_kwargs:
94
- use_ema_scope: False
95
- N: 16
96
- n_rows: 4
97
-
98
- trainer:
99
- devices: 0,
100
- benchmark: True
101
- num_sanity_val_steps: 0
102
- accumulate_grad_batches: 1
103
- max_epochs: 20
configs/example_training/toy/mnist_cond_l1_loss.yaml DELETED
@@ -1,99 +0,0 @@
1
- model:
2
- base_learning_rate: 1.0e-4
3
- target: sgm.models.diffusion.DiffusionEngine
4
- params:
5
- denoiser_config:
6
- target: sgm.modules.diffusionmodules.denoiser.Denoiser
7
- params:
8
- scaling_config:
9
- target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling
10
- params:
11
- sigma_data: 1.0
12
-
13
- network_config:
14
- target: sgm.modules.diffusionmodules.openaimodel.UNetModel
15
- params:
16
- in_channels: 1
17
- out_channels: 1
18
- model_channels: 32
19
- attention_resolutions: []
20
- num_res_blocks: 4
21
- channel_mult: [1, 2, 2]
22
- num_head_channels: 32
23
- num_classes: sequential
24
- adm_in_channels: 128
25
-
26
- conditioner_config:
27
- target: sgm.modules.GeneralConditioner
28
- params:
29
- emb_models:
30
- - is_trainable: True
31
- input_key: cls
32
- ucg_rate: 0.2
33
- target: sgm.modules.encoders.modules.ClassEmbedder
34
- params:
35
- embed_dim: 128
36
- n_classes: 10
37
-
38
- first_stage_config:
39
- target: sgm.models.autoencoder.IdentityFirstStage
40
-
41
- loss_fn_config:
42
- target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
43
- params:
44
- loss_type: l1
45
- loss_weighting_config:
46
- target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
47
- params:
48
- sigma_data: 1.0
49
- sigma_sampler_config:
50
- target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
51
-
52
- sampler_config:
53
- target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
54
- params:
55
- num_steps: 50
56
-
57
- discretization_config:
58
- target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
59
-
60
- guider_config:
61
- target: sgm.modules.diffusionmodules.guiders.VanillaCFG
62
- params:
63
- scale: 3.0
64
-
65
- data:
66
- target: sgm.data.mnist.MNISTLoader
67
- params:
68
- batch_size: 512
69
- num_workers: 1
70
-
71
- lightning:
72
- modelcheckpoint:
73
- params:
74
- every_n_train_steps: 5000
75
-
76
- callbacks:
77
- metrics_over_trainsteps_checkpoint:
78
- params:
79
- every_n_train_steps: 25000
80
-
81
- image_logger:
82
- target: main.ImageLogger
83
- params:
84
- disabled: False
85
- batch_frequency: 1000
86
- max_images: 64
87
- increase_log_steps: True
88
- log_first_step: False
89
- log_images_kwargs:
90
- use_ema_scope: False
91
- N: 64
92
- n_rows: 8
93
-
94
- trainer:
95
- devices: 0,
96
- benchmark: True
97
- num_sanity_val_steps: 0
98
- accumulate_grad_batches: 1
99
- max_epochs: 20
configs/example_training/toy/mnist_cond_with_ema.yaml DELETED
@@ -1,100 +0,0 @@
1
- model:
2
- base_learning_rate: 1.0e-4
3
- target: sgm.models.diffusion.DiffusionEngine
4
- params:
5
- use_ema: True
6
-
7
- denoiser_config:
8
- target: sgm.modules.diffusionmodules.denoiser.Denoiser
9
- params:
10
- scaling_config:
11
- target: sgm.modules.diffusionmodules.denoiser_scaling.EDMScaling
12
- params:
13
- sigma_data: 1.0
14
-
15
- network_config:
16
- target: sgm.modules.diffusionmodules.openaimodel.UNetModel
17
- params:
18
- in_channels: 1
19
- out_channels: 1
20
- model_channels: 32
21
- attention_resolutions: []
22
- num_res_blocks: 4
23
- channel_mult: [1, 2, 2]
24
- num_head_channels: 32
25
- num_classes: sequential
26
- adm_in_channels: 128
27
-
28
- conditioner_config:
29
- target: sgm.modules.GeneralConditioner
30
- params:
31
- emb_models:
32
- - is_trainable: True
33
- input_key: cls
34
- ucg_rate: 0.2
35
- target: sgm.modules.encoders.modules.ClassEmbedder
36
- params:
37
- embed_dim: 128
38
- n_classes: 10
39
-
40
- first_stage_config:
41
- target: sgm.models.autoencoder.IdentityFirstStage
42
-
43
- loss_fn_config:
44
- target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
45
- params:
46
- loss_weighting_config:
47
- target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
48
- params:
49
- sigma_data: 1.0
50
- sigma_sampler_config:
51
- target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
52
-
53
- sampler_config:
54
- target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
55
- params:
56
- num_steps: 50
57
-
58
- discretization_config:
59
- target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
60
-
61
- guider_config:
62
- target: sgm.modules.diffusionmodules.guiders.VanillaCFG
63
- params:
64
- scale: 3.0
65
-
66
- data:
67
- target: sgm.data.mnist.MNISTLoader
68
- params:
69
- batch_size: 512
70
- num_workers: 1
71
-
72
- lightning:
73
- modelcheckpoint:
74
- params:
75
- every_n_train_steps: 5000
76
-
77
- callbacks:
78
- metrics_over_trainsteps_checkpoint:
79
- params:
80
- every_n_train_steps: 25000
81
-
82
- image_logger:
83
- target: main.ImageLogger
84
- params:
85
- disabled: False
86
- batch_frequency: 1000
87
- max_images: 64
88
- increase_log_steps: True
89
- log_first_step: False
90
- log_images_kwargs:
91
- use_ema_scope: False
92
- N: 64
93
- n_rows: 8
94
-
95
- trainer:
96
- devices: 0,
97
- benchmark: True
98
- num_sanity_val_steps: 0
99
- accumulate_grad_batches: 1
100
- max_epochs: 20
configs/example_training/txt2img-clipl-legacy-ucg-training.yaml DELETED
@@ -1,182 +0,0 @@
1
- model:
2
- base_learning_rate: 1.0e-4
3
- target: sgm.models.diffusion.DiffusionEngine
4
- params:
5
- scale_factor: 0.13025
6
- disable_first_stage_autocast: True
7
- log_keys:
8
- - txt
9
-
10
- scheduler_config:
11
- target: sgm.lr_scheduler.LambdaLinearScheduler
12
- params:
13
- warm_up_steps: [10000]
14
- cycle_lengths: [10000000000000]
15
- f_start: [1.e-6]
16
- f_max: [1.]
17
- f_min: [1.]
18
-
19
- denoiser_config:
20
- target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
21
- params:
22
- num_idx: 1000
23
-
24
- scaling_config:
25
- target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
26
- discretization_config:
27
- target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
28
-
29
- network_config:
30
- target: sgm.modules.diffusionmodules.openaimodel.UNetModel
31
- params:
32
- use_checkpoint: True
33
- in_channels: 4
34
- out_channels: 4
35
- model_channels: 320
36
- attention_resolutions: [1, 2, 4]
37
- num_res_blocks: 2
38
- channel_mult: [1, 2, 4, 4]
39
- num_head_channels: 64
40
- num_classes: sequential
41
- adm_in_channels: 1792
42
- num_heads: 1
43
- transformer_depth: 1
44
- context_dim: 768
45
- spatial_transformer_attn_type: softmax-xformers
46
-
47
- conditioner_config:
48
-       target: sgm.modules.GeneralConditioner
-       params:
-         emb_models:
-           - is_trainable: True
-             input_key: txt
-             ucg_rate: 0.1
-             legacy_ucg_value: ""
-             target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
-             params:
-               always_return_pooled: True
-
-           - is_trainable: False
-             ucg_rate: 0.1
-             input_key: original_size_as_tuple
-             target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
-             params:
-               outdim: 256
-
-           - is_trainable: False
-             input_key: crop_coords_top_left
-             ucg_rate: 0.1
-             target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
-             params:
-               outdim: 256
-
-     first_stage_config:
-       target: sgm.models.autoencoder.AutoencoderKL
-       params:
-         ckpt_path: CKPT_PATH
-         embed_dim: 4
-         monitor: val/rec_loss
-         ddconfig:
-           attn_type: vanilla-xformers
-           double_z: true
-           z_channels: 4
-           resolution: 256
-           in_channels: 3
-           out_ch: 3
-           ch: 128
-           ch_mult: [ 1, 2, 4, 4 ]
-           num_res_blocks: 2
-           attn_resolutions: [ ]
-           dropout: 0.0
-         lossconfig:
-           target: torch.nn.Identity
-
-     loss_fn_config:
-       target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
-       params:
-         loss_weighting_config:
-           target: sgm.modules.diffusionmodules.loss_weighting.EpsWeighting
-         sigma_sampler_config:
-           target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
-           params:
-             num_idx: 1000
-
-             discretization_config:
-               target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
-
-     sampler_config:
-       target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
-       params:
-         num_steps: 50
-
-         discretization_config:
-           target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
-
-         guider_config:
-           target: sgm.modules.diffusionmodules.guiders.VanillaCFG
-           params:
-             scale: 7.5
-
- data:
-   target: sgm.data.dataset.StableDataModuleFromConfig
-   params:
-     train:
-       datapipeline:
-         urls:
-           # USER: adapt this path the root of your custom dataset
-           - DATA_PATH
-         pipeline_config:
-           shardshuffle: 10000
-           sample_shuffle: 10000 # USER: you might wanna adapt depending on your available RAM
-
-         decoders:
-           - pil
-
-         postprocessors:
-           - target: sdata.mappers.TorchVisionImageTransforms
-             params:
-               key: jpg # USER: you might wanna adapt this for your custom dataset
-               transforms:
-                 - target: torchvision.transforms.Resize
-                   params:
-                     size: 256
-                     interpolation: 3
-                 - target: torchvision.transforms.ToTensor
-           - target: sdata.mappers.Rescaler
-           - target: sdata.mappers.AddOriginalImageSizeAsTupleAndCropToSquare
-             # USER: you might wanna use non-default parameters due to your custom dataset
-
-     loader:
-       batch_size: 64
-       num_workers: 6
-
- lightning:
-   modelcheckpoint:
-     params:
-       every_n_train_steps: 5000
-
-   callbacks:
-     metrics_over_trainsteps_checkpoint:
-       params:
-         every_n_train_steps: 25000
-
-     image_logger:
-       target: main.ImageLogger
-       params:
-         disabled: False
-         enable_autocast: False
-         batch_frequency: 1000
-         max_images: 8
-         increase_log_steps: True
-         log_first_step: False
-         log_images_kwargs:
-           use_ema_scope: False
-           N: 8
-           n_rows: 2
-
-   trainer:
-     devices: 0,
-     benchmark: True
-     num_sanity_val_steps: 0
-     accumulate_grad_batches: 1
-     max_epochs: 1000
configs/example_training/txt2img-clipl.yaml DELETED
@@ -1,184 +0,0 @@
- model:
-   base_learning_rate: 1.0e-4
-   target: sgm.models.diffusion.DiffusionEngine
-   params:
-     scale_factor: 0.13025
-     disable_first_stage_autocast: True
-     log_keys:
-       - txt
-
-     scheduler_config:
-       target: sgm.lr_scheduler.LambdaLinearScheduler
-       params:
-         warm_up_steps: [10000]
-         cycle_lengths: [10000000000000]
-         f_start: [1.e-6]
-         f_max: [1.]
-         f_min: [1.]
-
-     denoiser_config:
-       target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
-       params:
-         num_idx: 1000
-
-         scaling_config:
-           target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
-         discretization_config:
-           target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
-
-     network_config:
-       target: sgm.modules.diffusionmodules.openaimodel.UNetModel
-       params:
-         use_checkpoint: True
-         in_channels: 4
-         out_channels: 4
-         model_channels: 320
-         attention_resolutions: [1, 2, 4]
-         num_res_blocks: 2
-         channel_mult: [1, 2, 4, 4]
-         num_head_channels: 64
-         num_classes: sequential
-         adm_in_channels: 1792
-         num_heads: 1
-         transformer_depth: 1
-         context_dim: 768
-         spatial_transformer_attn_type: softmax-xformers
-
-     conditioner_config:
-       target: sgm.modules.GeneralConditioner
-       params:
-         emb_models:
-           - is_trainable: True
-             input_key: txt
-             ucg_rate: 0.1
-             legacy_ucg_value: ""
-             target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
-             params:
-               always_return_pooled: True
-
-           - is_trainable: False
-             ucg_rate: 0.1
-             input_key: original_size_as_tuple
-             target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
-             params:
-               outdim: 256
-
-           - is_trainable: False
-             input_key: crop_coords_top_left
-             ucg_rate: 0.1
-             target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
-             params:
-               outdim: 256
-
-     first_stage_config:
-       target: sgm.models.autoencoder.AutoencoderKL
-       params:
-         ckpt_path: CKPT_PATH
-         embed_dim: 4
-         monitor: val/rec_loss
-         ddconfig:
-           attn_type: vanilla-xformers
-           double_z: true
-           z_channels: 4
-           resolution: 256
-           in_channels: 3
-           out_ch: 3
-           ch: 128
-           ch_mult: [1, 2, 4, 4]
-           num_res_blocks: 2
-           attn_resolutions: []
-           dropout: 0.0
-         lossconfig:
-           target: torch.nn.Identity
-
-     loss_fn_config:
-       target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
-       params:
-         loss_weighting_config:
-           target: sgm.modules.diffusionmodules.loss_weighting.EpsWeighting
-         sigma_sampler_config:
-           target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling
-           params:
-             num_idx: 1000
-
-             discretization_config:
-               target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
-
-     sampler_config:
-       target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
-       params:
-         num_steps: 50
-
-         discretization_config:
-           target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
-
-         guider_config:
-           target: sgm.modules.diffusionmodules.guiders.VanillaCFG
-           params:
-             scale: 7.5
-
- data:
-   target: sgm.data.dataset.StableDataModuleFromConfig
-   params:
-     train:
-       datapipeline:
-         urls:
-           # USER: adapt this path the root of your custom dataset
-           - DATA_PATH
-         pipeline_config:
-           shardshuffle: 10000
-           sample_shuffle: 10000
-
-
-         decoders:
-           - pil
-
-         postprocessors:
-           - target: sdata.mappers.TorchVisionImageTransforms
-             params:
-               key: jpg # USER: you might wanna adapt this for your custom dataset
-               transforms:
-                 - target: torchvision.transforms.Resize
-                   params:
-                     size: 256
-                     interpolation: 3
-                 - target: torchvision.transforms.ToTensor
-           - target: sdata.mappers.Rescaler
-             # USER: you might wanna use non-default parameters due to your custom dataset
-           - target: sdata.mappers.AddOriginalImageSizeAsTupleAndCropToSquare
-             # USER: you might wanna use non-default parameters due to your custom dataset
-
-     loader:
-       batch_size: 64
-       num_workers: 6
-
- lightning:
-   modelcheckpoint:
-     params:
-       every_n_train_steps: 5000
-
-   callbacks:
-     metrics_over_trainsteps_checkpoint:
-       params:
-         every_n_train_steps: 25000
-
-     image_logger:
-       target: main.ImageLogger
-       params:
-         disabled: False
-         enable_autocast: False
-         batch_frequency: 1000
-         max_images: 8
-         increase_log_steps: True
-         log_first_step: False
-         log_images_kwargs:
-           use_ema_scope: False
-           N: 8
-           n_rows: 2
-
-   trainer:
-     devices: 0,
-     benchmark: True
-     num_sanity_val_steps: 0
-     accumulate_grad_batches: 1
-     max_epochs: 1000
configs/inference/sd_2_1.yaml DELETED
@@ -1,60 +0,0 @@
- model:
-   target: sgm.models.diffusion.DiffusionEngine
-   params:
-     scale_factor: 0.18215
-     disable_first_stage_autocast: True
-
-     denoiser_config:
-       target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
-       params:
-         num_idx: 1000
-
-         scaling_config:
-           target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
-         discretization_config:
-           target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
-
-     network_config:
-       target: sgm.modules.diffusionmodules.openaimodel.UNetModel
-       params:
-         use_checkpoint: True
-         in_channels: 4
-         out_channels: 4
-         model_channels: 320
-         attention_resolutions: [4, 2, 1]
-         num_res_blocks: 2
-         channel_mult: [1, 2, 4, 4]
-         num_head_channels: 64
-         use_linear_in_transformer: True
-         transformer_depth: 1
-         context_dim: 1024
-
-     conditioner_config:
-       target: sgm.modules.GeneralConditioner
-       params:
-         emb_models:
-           - is_trainable: False
-             input_key: txt
-             target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder
-             params:
-               freeze: true
-               layer: penultimate
-
-     first_stage_config:
-       target: sgm.models.autoencoder.AutoencoderKL
-       params:
-         embed_dim: 4
-         monitor: val/rec_loss
-         ddconfig:
-           double_z: true
-           z_channels: 4
-           resolution: 256
-           in_channels: 3
-           out_ch: 3
-           ch: 128
-           ch_mult: [1, 2, 4, 4]
-           num_res_blocks: 2
-           attn_resolutions: []
-           dropout: 0.0
-         lossconfig:
-           target: torch.nn.Identity
configs/inference/sd_2_1_768.yaml DELETED
@@ -1,60 +0,0 @@
- model:
-   target: sgm.models.diffusion.DiffusionEngine
-   params:
-     scale_factor: 0.18215
-     disable_first_stage_autocast: True
-
-     denoiser_config:
-       target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
-       params:
-         num_idx: 1000
-
-         scaling_config:
-           target: sgm.modules.diffusionmodules.denoiser_scaling.VScaling
-         discretization_config:
-           target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
-
-     network_config:
-       target: sgm.modules.diffusionmodules.openaimodel.UNetModel
-       params:
-         use_checkpoint: True
-         in_channels: 4
-         out_channels: 4
-         model_channels: 320
-         attention_resolutions: [4, 2, 1]
-         num_res_blocks: 2
-         channel_mult: [1, 2, 4, 4]
-         num_head_channels: 64
-         use_linear_in_transformer: True
-         transformer_depth: 1
-         context_dim: 1024
-
-     conditioner_config:
-       target: sgm.modules.GeneralConditioner
-       params:
-         emb_models:
-           - is_trainable: False
-             input_key: txt
-             target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder
-             params:
-               freeze: true
-               layer: penultimate
-
-     first_stage_config:
-       target: sgm.models.autoencoder.AutoencoderKL
-       params:
-         embed_dim: 4
-         monitor: val/rec_loss
-         ddconfig:
-           double_z: true
-           z_channels: 4
-           resolution: 256
-           in_channels: 3
-           out_ch: 3
-           ch: 128
-           ch_mult: [1, 2, 4, 4]
-           num_res_blocks: 2
-           attn_resolutions: []
-           dropout: 0.0
-         lossconfig:
-           target: torch.nn.Identity
configs/inference/sd_xl_base.yaml DELETED
@@ -1,93 +0,0 @@
- model:
-   target: sgm.models.diffusion.DiffusionEngine
-   params:
-     scale_factor: 0.13025
-     disable_first_stage_autocast: True
-
-     denoiser_config:
-       target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
-       params:
-         num_idx: 1000
-
-         scaling_config:
-           target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
-         discretization_config:
-           target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
-
-     network_config:
-       target: sgm.modules.diffusionmodules.openaimodel.UNetModel
-       params:
-         adm_in_channels: 2816
-         num_classes: sequential
-         use_checkpoint: True
-         in_channels: 4
-         out_channels: 4
-         model_channels: 320
-         attention_resolutions: [4, 2]
-         num_res_blocks: 2
-         channel_mult: [1, 2, 4]
-         num_head_channels: 64
-         use_linear_in_transformer: True
-         transformer_depth: [1, 2, 10]
-         context_dim: 2048
-         spatial_transformer_attn_type: softmax-xformers
-
-     conditioner_config:
-       target: sgm.modules.GeneralConditioner
-       params:
-         emb_models:
-           - is_trainable: False
-             input_key: txt
-             target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
-             params:
-               layer: hidden
-               layer_idx: 11
-
-           - is_trainable: False
-             input_key: txt
-             target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
-             params:
-               arch: ViT-bigG-14
-               version: laion2b_s39b_b160k
-               freeze: True
-               layer: penultimate
-               always_return_pooled: True
-               legacy: False
-
-           - is_trainable: False
-             input_key: original_size_as_tuple
-             target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
-             params:
-               outdim: 256
-
-           - is_trainable: False
-             input_key: crop_coords_top_left
-             target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
-             params:
-               outdim: 256
-
-           - is_trainable: False
-             input_key: target_size_as_tuple
-             target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
-             params:
-               outdim: 256
-
-     first_stage_config:
-       target: sgm.models.autoencoder.AutoencoderKL
-       params:
-         embed_dim: 4
-         monitor: val/rec_loss
-         ddconfig:
-           attn_type: vanilla-xformers
-           double_z: true
-           z_channels: 4
-           resolution: 256
-           in_channels: 3
-           out_ch: 3
-           ch: 128
-           ch_mult: [1, 2, 4, 4]
-           num_res_blocks: 2
-           attn_resolutions: []
-           dropout: 0.0
-         lossconfig:
-           target: torch.nn.Identity
configs/inference/sd_xl_refiner.yaml DELETED
@@ -1,86 +0,0 @@
- model:
-   target: sgm.models.diffusion.DiffusionEngine
-   params:
-     scale_factor: 0.13025
-     disable_first_stage_autocast: True
-
-     denoiser_config:
-       target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
-       params:
-         num_idx: 1000
-
-         scaling_config:
-           target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
-         discretization_config:
-           target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
-
-     network_config:
-       target: sgm.modules.diffusionmodules.openaimodel.UNetModel
-       params:
-         adm_in_channels: 2560
-         num_classes: sequential
-         use_checkpoint: True
-         in_channels: 4
-         out_channels: 4
-         model_channels: 384
-         attention_resolutions: [4, 2]
-         num_res_blocks: 2
-         channel_mult: [1, 2, 4, 4]
-         num_head_channels: 64
-         use_linear_in_transformer: True
-         transformer_depth: 4
-         context_dim: [1280, 1280, 1280, 1280]
-         spatial_transformer_attn_type: softmax-xformers
-
-     conditioner_config:
-       target: sgm.modules.GeneralConditioner
-       params:
-         emb_models:
-           - is_trainable: False
-             input_key: txt
-             target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
-             params:
-               arch: ViT-bigG-14
-               version: laion2b_s39b_b160k
-               legacy: False
-               freeze: True
-               layer: penultimate
-               always_return_pooled: True
-
-           - is_trainable: False
-             input_key: original_size_as_tuple
-             target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
-             params:
-               outdim: 256
-
-           - is_trainable: False
-             input_key: crop_coords_top_left
-             target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
-             params:
-               outdim: 256
-
-           - is_trainable: False
-             input_key: aesthetic_score
-             target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
-             params:
-               outdim: 256
-
-     first_stage_config:
-       target: sgm.models.autoencoder.AutoencoderKL
-       params:
-         embed_dim: 4
-         monitor: val/rec_loss
-         ddconfig:
-           attn_type: vanilla-xformers
-           double_z: true
-           z_channels: 4
-           resolution: 256
-           in_channels: 3
-           out_ch: 3
-           ch: 128
-           ch_mult: [1, 2, 4, 4]
-           num_res_blocks: 2
-           attn_resolutions: []
-           dropout: 0.0
-         lossconfig:
-           target: torch.nn.Identity
configs/inference/svd.yaml DELETED
@@ -1,131 +0,0 @@
- model:
-   target: sgm.models.diffusion.DiffusionEngine
-   params:
-     scale_factor: 0.18215
-     disable_first_stage_autocast: True
-
-     denoiser_config:
-       target: sgm.modules.diffusionmodules.denoiser.Denoiser
-       params:
-         scaling_config:
-           target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
-
-     network_config:
-       target: sgm.modules.diffusionmodules.video_model.VideoUNet
-       params:
-         adm_in_channels: 768
-         num_classes: sequential
-         use_checkpoint: True
-         in_channels: 8
-         out_channels: 4
-         model_channels: 320
-         attention_resolutions: [4, 2, 1]
-         num_res_blocks: 2
-         channel_mult: [1, 2, 4, 4]
-         num_head_channels: 64
-         use_linear_in_transformer: True
-         transformer_depth: 1
-         context_dim: 1024
-         spatial_transformer_attn_type: softmax-xformers
-         extra_ff_mix_layer: True
-         use_spatial_context: True
-         merge_strategy: learned_with_images
-         video_kernel_size: [3, 1, 1]
-
-     conditioner_config:
-       target: sgm.modules.GeneralConditioner
-       params:
-         emb_models:
-           - is_trainable: False
-             input_key: cond_frames_without_noise
-             target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
-             params:
-               n_cond_frames: 1
-               n_copies: 1
-               open_clip_embedding_config:
-                 target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
-                 params:
-                   freeze: True
-
-           - input_key: fps_id
-             is_trainable: False
-             target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
-             params:
-               outdim: 256
-
-           - input_key: motion_bucket_id
-             is_trainable: False
-             target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
-             params:
-               outdim: 256
-
-           - input_key: cond_frames
-             is_trainable: False
-             target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
-             params:
-               disable_encoder_autocast: True
-               n_cond_frames: 1
-               n_copies: 1
-               is_ae: True
-               encoder_config:
-                 target: sgm.models.autoencoder.AutoencoderKLModeOnly
-                 params:
-                   embed_dim: 4
-                   monitor: val/rec_loss
-                   ddconfig:
-                     attn_type: vanilla-xformers
-                     double_z: True
-                     z_channels: 4
-                     resolution: 256
-                     in_channels: 3
-                     out_ch: 3
-                     ch: 128
-                     ch_mult: [1, 2, 4, 4]
-                     num_res_blocks: 2
-                     attn_resolutions: []
-                     dropout: 0.0
-                   lossconfig:
-                     target: torch.nn.Identity
-
-           - input_key: cond_aug
-             is_trainable: False
-             target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
-             params:
-               outdim: 256
-
-     first_stage_config:
-       target: sgm.models.autoencoder.AutoencodingEngine
-       params:
-         loss_config:
-           target: torch.nn.Identity
-         regularizer_config:
-           target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
-         encoder_config:
-           target: sgm.modules.diffusionmodules.model.Encoder
-           params:
-             attn_type: vanilla
-             double_z: True
-             z_channels: 4
-             resolution: 256
-             in_channels: 3
-             out_ch: 3
-             ch: 128
-             ch_mult: [1, 2, 4, 4]
-             num_res_blocks: 2
-             attn_resolutions: []
-             dropout: 0.0
-         decoder_config:
-           target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
-           params:
-             attn_type: vanilla
-             double_z: True
-             z_channels: 4
-             resolution: 256
-             in_channels: 3
-             out_ch: 3
-             ch: 128
-             ch_mult: [1, 2, 4, 4]
-             num_res_blocks: 2
-             attn_resolutions: []
-             dropout: 0.0
-             video_kernel_size: [3, 1, 1]
configs/inference/svd_image_decoder.yaml DELETED
@@ -1,114 +0,0 @@
- model:
-   target: sgm.models.diffusion.DiffusionEngine
-   params:
-     scale_factor: 0.18215
-     disable_first_stage_autocast: True
-
-     denoiser_config:
-       target: sgm.modules.diffusionmodules.denoiser.Denoiser
-       params:
-         scaling_config:
-           target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
-
-     network_config:
-       target: sgm.modules.diffusionmodules.video_model.VideoUNet
-       params:
-         adm_in_channels: 768
-         num_classes: sequential
-         use_checkpoint: True
-         in_channels: 8
-         out_channels: 4
-         model_channels: 320
-         attention_resolutions: [4, 2, 1]
-         num_res_blocks: 2
-         channel_mult: [1, 2, 4, 4]
-         num_head_channels: 64
-         use_linear_in_transformer: True
-         transformer_depth: 1
-         context_dim: 1024
-         spatial_transformer_attn_type: softmax-xformers
-         extra_ff_mix_layer: True
-         use_spatial_context: True
-         merge_strategy: learned_with_images
-         video_kernel_size: [3, 1, 1]
-
-     conditioner_config:
-       target: sgm.modules.GeneralConditioner
-       params:
-         emb_models:
-           - is_trainable: False
-             input_key: cond_frames_without_noise
-             target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
-             params:
-               n_cond_frames: 1
-               n_copies: 1
-               open_clip_embedding_config:
-                 target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
-                 params:
-                   freeze: True
-
-           - input_key: fps_id
-             is_trainable: False
-             target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
-             params:
-               outdim: 256
-
-           - input_key: motion_bucket_id
-             is_trainable: False
-             target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
-             params:
-               outdim: 256
-
-           - input_key: cond_frames
-             is_trainable: False
-             target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
-             params:
-               disable_encoder_autocast: True
-               n_cond_frames: 1
-               n_copies: 1
-               is_ae: True
-               encoder_config:
-                 target: sgm.models.autoencoder.AutoencoderKLModeOnly
-                 params:
-                   embed_dim: 4
-                   monitor: val/rec_loss
-                   ddconfig:
-                     attn_type: vanilla-xformers
-                     double_z: True
-                     z_channels: 4
-                     resolution: 256
-                     in_channels: 3
-                     out_ch: 3
-                     ch: 128
-                     ch_mult: [1, 2, 4, 4]
-                     num_res_blocks: 2
-                     attn_resolutions: []
-                     dropout: 0.0
-                   lossconfig:
-                     target: torch.nn.Identity
-
-           - input_key: cond_aug
-             is_trainable: False
-             target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
-             params:
-               outdim: 256
-
-     first_stage_config:
-       target: sgm.models.autoencoder.AutoencoderKL
-       params:
-         embed_dim: 4
-         monitor: val/rec_loss
-         ddconfig:
-           attn_type: vanilla-xformers
-           double_z: True
-           z_channels: 4
-           resolution: 256
-           in_channels: 3
-           out_ch: 3
-           ch: 128
-           ch_mult: [1, 2, 4, 4]
-           num_res_blocks: 2
-           attn_resolutions: []
-           dropout: 0.0
-         lossconfig:
-           target: torch.nn.Identity
configs/inference/svd_mv.yaml DELETED
@@ -1,202 +0,0 @@
- model:
-   base_learning_rate: 1.0e-05
-   target: sgm.models.video_diffusion.DiffusionEngine
-   params:
-     ckpt_path: ckpts/svd_xt.safetensors
-     scale_factor: 0.18215
-     disable_first_stage_autocast: true
-     scheduler_config:
-       target: sgm.lr_scheduler.LambdaLinearScheduler
-       params:
-         warm_up_steps:
-         - 1
-         cycle_lengths:
-         - 10000000000000
-         f_start:
-         - 1.0e-06
-         f_max:
-         - 1.0
-         f_min:
-         - 1.0
-     denoiser_config:
-       target: sgm.modules.diffusionmodules.denoiser.Denoiser
-       params:
-         scaling_config:
-           target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise
-     network_config:
-       target: sgm.modules.diffusionmodules.video_model.VideoUNet
-       params:
-         adm_in_channels: 768
-         num_classes: sequential
-         use_checkpoint: true
-         in_channels: 8
-         out_channels: 4
-         model_channels: 320
-         attention_resolutions:
-         - 4
-         - 2
-         - 1
-         num_res_blocks: 2
-         channel_mult:
-         - 1
-         - 2
-         - 4
-         - 4
-         num_head_channels: 64
-         use_linear_in_transformer: true
-         transformer_depth: 1
-         context_dim: 1024
-         spatial_transformer_attn_type: softmax-xformers
-         extra_ff_mix_layer: true
-         use_spatial_context: true
-         merge_strategy: learned_with_images
-         video_kernel_size:
-         - 3
-         - 1
-         - 1
-     conditioner_config:
-       target: sgm.modules.GeneralConditioner
-       params:
-         emb_models:
-         - is_trainable: false
-           ucg_rate: 0.2
-           input_key: cond_frames_without_noise
-           target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
-           params:
-             n_cond_frames: 1
-             n_copies: 1
-             open_clip_embedding_config:
-               target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
-               params:
-                 freeze: true
-         - input_key: fps_id
-           is_trainable: true
-           target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
-           params:
-             outdim: 256
-         - input_key: motion_bucket_id
-           is_trainable: true
-           target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
-           params:
-             outdim: 256
-         - input_key: cond_frames
-           is_trainable: false
-           ucg_rate: 0.2
-           target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
-           params:
-             disable_encoder_autocast: true
-             n_cond_frames: 1
-             n_copies: 1
-             is_ae: true
-             encoder_config:
-               target: sgm.models.autoencoder.AutoencoderKLModeOnly
-               params:
-                 embed_dim: 4
-                 monitor: val/rec_loss
-                 ddconfig:
-                   attn_type: vanilla-xformers
-                   double_z: true
-                   z_channels: 4
-                   resolution: 256
-                   in_channels: 3
-                   out_ch: 3
-                   ch: 128
-                   ch_mult:
-                   - 1
-                   - 2
-                   - 4
-                   - 4
-                   num_res_blocks: 2
-                   attn_resolutions: []
-                   dropout: 0.0
-                 lossconfig:
-                   target: torch.nn.Identity
-         - input_key: cond_aug
-           is_trainable: true
-           target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
-           params:
-             outdim: 256
-     first_stage_config:
-       target: sgm.models.autoencoder.AutoencodingEngine
-       params:
-         loss_config:
-           target: torch.nn.Identity
-         regularizer_config:
-           target: sgm.modules.autoencoding.regularizers.DiagonalGaussianRegularizer
-         encoder_config:
-           target: sgm.modules.diffusionmodules.model.Encoder
-           params:
-             attn_type: vanilla
-             double_z: true
-             z_channels: 4
-             resolution: 256
-             in_channels: 3
-             out_ch: 3
-             ch: 128
-             ch_mult:
-             - 1
-             - 2
-             - 4
-             - 4
-             num_res_blocks: 2
-             attn_resolutions: []
-             dropout: 0.0
-         decoder_config:
-           target: sgm.modules.autoencoding.temporal_ae.VideoDecoder
-           params:
-             attn_type: vanilla
-             double_z: true
-             z_channels: 4
-             resolution: 256
-             in_channels: 3
-             out_ch: 3
-             ch: 128
-             ch_mult:
-             - 1
-             - 2
-             - 4
-             - 4
-             num_res_blocks: 2
-             attn_resolutions: []
-             dropout: 0.0
-             video_kernel_size:
-             - 3
-             - 1
-             - 1
-     sampler_config:
-       target: sgm.modules.diffusionmodules.sampling.EulerEDMSampler
-       params:
-         num_steps: 30
-         discretization_config:
-           target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
-           params:
-             sigma_max: 700.0
-         guider_config:
-           target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
-           params:
-             max_scale: 2.5
-             min_scale: 1.0
-             num_frames: 24
-     loss_fn_config:
-       target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
-       params:
-         batch2model_keys:
-         - num_video_frames
-         - image_only_indicator
-         loss_weighting_config:
-           target: sgm.modules.diffusionmodules.loss_weighting.EDMWeighting
-           params:
-             sigma_data: 1.0
-         sigma_sampler_config:
-           target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
-           params:
-             p_mean: 0.3
-             p_std: 1.2
- data:
-   target: sgm.data.objaverse.ObjaverseSpiralDataset
-   params:
-     root_dir: /mnt/mfs/zilong.chen/Downloads/objaverse-ndd-samples
-     random_front: true
-     batch_size: 2
-     num_workers: 16
-     cond_aug_mean: -0.0
configs/instant-mesh-base.yaml ADDED
@@ -0,0 +1,22 @@
+ model_config:
+   target: src.models.lrm_mesh.InstantMesh
+   params:
+     encoder_feat_dim: 768
+     encoder_freeze: false
+     encoder_model_name: facebook/dino-vitb16
+     transformer_dim: 1024
+     transformer_layers: 12
+     transformer_heads: 16
+     triplane_low_res: 32
+     triplane_high_res: 64
+     triplane_dim: 40
+     rendering_samples_per_ray: 96
+     grid_res: 128
+     grid_scale: 2.1
+
+
+ infer_config:
+   unet_path: ckpts/diffusion_pytorch_model.bin
+   model_path: ckpts/instant_mesh_base.ckpt
+   texture_resolution: 1024
+   render_resolution: 512
configs/instant-mesh-large.yaml ADDED
@@ -0,0 +1,22 @@
+ model_config:
+   target: src.models.lrm_mesh.InstantMesh
+   params:
+     encoder_feat_dim: 768
+     encoder_freeze: false
+     encoder_model_name: facebook/dino-vitb16
+     transformer_dim: 1024
+     transformer_layers: 16
+     transformer_heads: 16
+     triplane_low_res: 32
+     triplane_high_res: 64
+     triplane_dim: 80
+     rendering_samples_per_ray: 128
+     grid_res: 128
+     grid_scale: 2.1
+
+
+ infer_config:
+   unet_path: ckpts/diffusion_pytorch_model.bin
+   model_path: ckpts/instant_mesh_large.ckpt
+   texture_resolution: 1024
+   render_resolution: 512
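
The new configs all follow a plain `target`/`params` layout plus an `infer_config` block for inference-time assets. As a rough sketch of how such a file could be consumed (assuming the Space keeps the repo's `src/` package importable and uses OmegaConf, as app.py does; the helper below is illustrative, not the app's actual loading code):

# Illustrative sketch: resolve the class named in `model_config.target` and build it
# from `model_config.params`; the checkpoint handling is a guess at a typical layout.
import importlib

import torch
from omegaconf import OmegaConf


def build_model_from_config(config_path: str):
    cfg = OmegaConf.load(config_path)
    module_name, class_name = cfg.model_config.target.rsplit(".", 1)
    model_cls = getattr(importlib.import_module(module_name), class_name)
    model = model_cls(**cfg.model_config.params)

    # infer_config points at the reconstruction checkpoint listed in the YAML above.
    ckpt = torch.load(cfg.infer_config.model_path, map_location="cpu")
    state_dict = ckpt.get("state_dict", ckpt) if isinstance(ckpt, dict) else ckpt
    model.load_state_dict(state_dict, strict=False)
    return model.eval()


# model = build_model_from_config("configs/instant-mesh-large.yaml")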
configs/instant-nerf-base.yaml ADDED
@@ -0,0 +1,21 @@
+ model_config:
+   target: src.models.lrm.InstantNeRF
+   params:
+     encoder_feat_dim: 768
+     encoder_freeze: false
+     encoder_model_name: facebook/dino-vitb16
+     transformer_dim: 1024
+     transformer_layers: 12
+     transformer_heads: 16
+     triplane_low_res: 32
+     triplane_high_res: 64
+     triplane_dim: 40
+     rendering_samples_per_ray: 96
+
+
+ infer_config:
+   unet_path: ckpts/diffusion_pytorch_model.bin
+   model_path: ckpts/instant_nerf_base.ckpt
+   mesh_threshold: 10.0
+   mesh_resolution: 256
+   render_resolution: 384
configs/instant-nerf-large.yaml ADDED
@@ -0,0 +1,21 @@
+ model_config:
+   target: src.models.lrm.InstantNeRF
+   params:
+     encoder_feat_dim: 768
+     encoder_freeze: false
+     encoder_model_name: facebook/dino-vitb16
+     transformer_dim: 1024
+     transformer_layers: 16
+     transformer_heads: 16
+     triplane_low_res: 32
+     triplane_high_res: 64
+     triplane_dim: 80
+     rendering_samples_per_ray: 128
+
+
+ infer_config:
+   unet_path: ckpts/diffusion_pytorch_model.bin
+   model_path: ckpts/instant_nerf_large.ckpt
+   mesh_threshold: 10.0
+   mesh_resolution: 256
+   render_resolution: 384
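
The four variants differ only in backbone (`src.models.lrm_mesh.InstantMesh` vs `src.models.lrm.InstantNeRF`) and capacity (base: 12 transformer layers, triplane_dim 40, 96 samples per ray; large: 16 layers, triplane_dim 80, 128 samples per ray), plus the checkpoint each points to. A small, hypothetical helper for picking one by name (the mapping simply mirrors the files added in this commit):

# Hypothetical convenience mapping from a variant name to its config; only the
# paths added in this commit are assumed to exist on disk.
from omegaconf import OmegaConf

CONFIGS = {
    "instant-mesh-base": "configs/instant-mesh-base.yaml",
    "instant-mesh-large": "configs/instant-mesh-large.yaml",
    "instant-nerf-base": "configs/instant-nerf-base.yaml",
    "instant-nerf-large": "configs/instant-nerf-large.yaml",
}


def load_variant(name: str = "instant-mesh-large"):
    """Return (model_config, infer_config) for the chosen variant."""
    cfg = OmegaConf.load(CONFIGS[name])
    return cfg.model_config, cfg.infer_config


# model_cfg, infer_cfg = load_variant("instant-nerf-base")
# Note: mesh_threshold / mesh_resolution appear only in the NeRF variants,
# texture_resolution only in the mesh variants, per the YAML above.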
examples/bird.jpg ADDED
examples/bubble_mart_blue.png ADDED
examples/cartoon_dinosaur.png ADDED
examples/cartoon_girl.jpg ADDED
examples/chair_armed.png ADDED
examples/chair_comfort.jpg ADDED
examples/chair_wood.jpg ADDED
examples/chest.jpg ADDED
examples/fruit_bycycle.jpg ADDED
examples/fruit_elephant.jpg ADDED
examples/genshin_building.png ADDED
examples/genshin_teapot.png ADDED
examples/hatsune_miku.png ADDED
examples/house2.jpg ADDED
examples/mushroom_teapot.jpg ADDED
examples/pikachu.png ADDED
examples/plant.jpg ADDED
examples/robot.jpg ADDED
examples/sea_turtle.png ADDED
examples/skating_shoe.jpg ADDED
examples/sorting_board.png ADDED
examples/sword.png ADDED