---
exp_root_dir: "outputs"
name: "image-to-shape-diffusion/clip-dino-rgb-pixart-lr2e4-ddim"
tag: "${rmspace:${system.shape_model_type}+n${data.n_samples}+pfeat${system.shape_model.point_feats}+lr${system.optimizer.args.lr},_}"
seed: 0

data_type: "objaverse-datamodule"
data:
  root_dir: ./data/objaverse
  data_type: "sdf"
  sampling_strategy: random
  n_samples: 10240
  
  load_supervision: false
  supervision_type: ""
  n_supervision: 0

  load_image: true              # whether to load images
  image_data_path: ./data/objaverse/render+blender+singleview+nv20
  image_type: "rgb"             # rgb, normal
  idx: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]        # candidate view indices — NOTE(review): previously commented "front view" but 20 views are listed; confirm intent
  n_views: 1
  background_color: [0.5, 0.5, 0.5]
  marign_pix_dis: 30            # margin in pixels — key spelling "marign" kept as-is (sic); do not rename without updating the data loader

  batch_size: 40
  num_workers: 16

system_type: "pixart-diffusion-system"
system:
  val_samples_json: "val_data/images/val_samples_rgb_image.json"
  z_scale_factor: 1.0
  guidance_scale: 7.5
  num_inference_steps: 50
  eta: 0.0
  extract_mesh_func: diffdmc

  shape_model_type: michelangelo-autoencoder
  shape_model:
    pretrained_model_name_or_path: /mnt/cfs/public/native3D/ckpts/michelangelo-autoencoder-l256-e64-ne8-nd16-scaleup.ckpt
    use_downsample: true
    downsample_ratio: 0.0625
    num_latents: 768
    use_multi_reso: false
    resolutions: [4096, 8192, 12288]
    sampling_prob: [0, 0, 1]
    embed_dim: 64
    point_feats: 3
    out_dim: 1
    num_freqs: 8
    include_pi: false
    heads: 12
    width: 768
    num_encoder_layers: 8
    num_decoder_layers: 16
    use_ln_post: true
    init_scale: 0.25
    qkv_bias: false
    use_flash: true
    use_checkpoint: true
    

  condition_model_type: "cond-embedder"
  condition_model:
    pretrained_clip_name_or_path: openai/clip-vit-large-patch14
    pretrained_dino_name_or_path: facebook/dinov2-base
    pretrained_tokenizer_name_or_path: openai/clip-vit-large-patch14
    freeze_modulation_clip: true
    freeze_modulation_dino: true
    encode_camera: false
    camera_embeds_dim: 0
    n_views: ${data.n_views}
    empty_embeds_ratio: 0.1
    normalize_embeds: false
    zero_uncond_embeds: true
    linear_proj_init: constant
    image_size_dino: 224
    image_size_clip: 224

  denoiser_model_type: "pixart-denoiser"
  denoiser_model:
    input_channels: ${system.shape_model.embed_dim}
    output_channels: ${system.shape_model.embed_dim}
    n_ctx: ${system.shape_model.num_latents}
    width: 768
    layers: 32
    heads: 12
    context_dim: 1024
    init_scale: 1.0
    skip_ln: true
    variance_type: ${system.noise_scheduler.variance_type}
    use_checkpoint: true
    dit_block: DiTBlock

  noise_scheduler_type: "diffusers.schedulers.DDPMScheduler"
  noise_scheduler:
    num_train_timesteps: 1000
    beta_start: 0.00085
    beta_end: 0.012
    beta_schedule: "scaled_linear"
    variance_type: "fixed_small"
    clip_sample: false

  denoise_scheduler_type: "diffusers.schedulers.DDIMScheduler"
  denoise_scheduler:
    num_train_timesteps: 1000
    beta_start: 0.00085
    beta_end: 0.012
    beta_schedule: "scaled_linear"
    clip_sample: false   # clip sample to -1~1
    set_alpha_to_one: false
    steps_offset: 1

  loggers:
    wandb:
      enable: false
      project: "CraftsMan"
      name: image-to-shape-diffusion+${name}+${tag}

  loss:
    loss_type: "mse"
    lambda_diffusion: 1.0

  optimizer:
    name: AdamW
    args:
      # canonical float spellings (digits on both sides of the dot) so every
      # YAML 1.1/1.2 resolver reads these as floats, not strings
      lr: 2.0e-4
      betas: [0.9, 0.99]
      eps: 1.0e-6

  scheduler:
    name: CosineAnnealingLR
    args:
      T_max: 5000
      eta_min: 1.0e-6   # explicit dot: YAML 1.1 parsers (e.g. PyYAML) read bare "1e-6" as a string

trainer:
  num_nodes: 1
  max_epochs: 100000
  log_every_n_steps: 5
  num_sanity_val_steps: 1
  check_val_every_n_epoch: 25
  enable_progress_bar: true
  precision: 16-mixed
  strategy: 'ddp_find_unused_parameters_true'

checkpoint:
  save_last: true
  save_top_k: -1
  every_n_train_steps: 5000