# CraftsMan3D / configs / image-to-shape-diffusion / clip-dino-rgb-pixart-lr2e4-ddim.yaml
# (upstream: wyysf, "update to v1.5", commit 8133633)
---
# Output location and run identity. ${...} values are OmegaConf-style
# interpolations resolved at load time; rmspace is a custom resolver
# (defined in the training code — presumably replaces spaces with "_").
exp_root_dir: "outputs"
name: "image-to-shape-diffusion/clip-dino-rgb-pixart-lr2e4-ddim"
# Self-describing run tag built from the key hyperparameters below.
tag: "${rmspace:${system.shape_model_type}+n${data.n_samples}+pfeat${system.shape_model.point_feats}+lr${system.optimizer.args.lr},_}"
seed: 0
# Dataset: Objaverse shapes as SDF point samples paired with pre-rendered
# RGB views. Booleans lowercased (true/false) for consistency with the
# rest of the file and yamllint's `truthy` rule; parsed values unchanged.
data_type: "objaverse-datamodule"
data:
  root_dir: ./data/objaverse
  data_type: "sdf"                # geometry supervision representation
  sampling_strategy: random
  n_samples: 10240                # surface points sampled per shape
  load_supervision: false
  supervision_type: ""
  n_supervision: 0
  load_image: true # whether to load images
  image_data_path: ./data/objaverse/render+blender+singleview+nv20
  image_type: "rgb" # rgb, normal
  # Candidate render indices; with n_views: 1 presumably one view is drawn
  # per item. NOTE(review): original inline comment said "front view" but
  # all 20 indices are listed — confirm intended behavior against the loader.
  idx: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
  n_views: 1
  background_color: [0.5, 0.5, 0.5]
  # NOTE(review): key name is a typo for "margin_pix_dis" but is kept as-is —
  # the consuming dataloader reads this exact key.
  marign_pix_dis: 30
  batch_size: 40
  num_workers: 16
# Diffusion system wrapping a frozen(?) pretrained shape VAE plus the
# denoiser/conditioner defined further down.
system_type: "pixart-diffusion-system"
system:
  val_samples_json: "val_data/images/val_samples_rgb_image.json"
  z_scale_factor: 1.0             # latent scaling before diffusion
  guidance_scale: 7.5             # classifier-free guidance at inference
  num_inference_steps: 50
  eta: 0.0                        # DDIM eta (0.0 = deterministic sampling)
  extract_mesh_func: diffdmc
  # Michelangelo-style point-cloud autoencoder providing the latent space
  # the diffusion model operates in.
  shape_model_type: michelangelo-autoencoder
  shape_model:
    # NOTE(review): cluster-absolute path — will not resolve outside the
    # original training environment; adjust per deployment.
    pretrained_model_name_or_path: /mnt/cfs/public/native3D/ckpts/michelangelo-autoencoder-l256-e64-ne8-nd16-scaleup.ckpt
    use_downsample: true
    downsample_ratio: 0.0625
    num_latents: 768
    # Multi-resolution sampling disabled; resolutions/sampling_prob below
    # are therefore inert under this config.
    use_multi_reso: false
    resolutions: [4096, 8192, 12288]
    sampling_prob: [0, 0, 1]
    embed_dim: 64                 # per-latent channel dim (diffusion operates here)
    point_feats: 3                # extra per-point features (also appears in the run tag)
    out_dim: 1
    num_freqs: 8
    include_pi: false
    heads: 12
    width: 768
    num_encoder_layers: 8
    num_decoder_layers: 16
    use_ln_post: true
    init_scale: 0.25
    qkv_bias: false
    use_flash: true               # flash attention
    use_checkpoint: true          # activation checkpointing to save memory
  # Image conditioner: CLIP + DINOv2 feature embedders (matches the
  # "clip-dino-rgb" part of the config name).
  condition_model_type: "cond-embedder"
  condition_model:
    pretrained_clip_name_or_path: openai/clip-vit-large-patch14
    pretrained_dino_name_or_path: facebook/dinov2-base
    pretrained_tokenizer_name_or_path: openai/clip-vit-large-patch14
    freeze_modulation_clip: true
    freeze_modulation_dino: true
    encode_camera: false          # no camera embedding (single-view setup)
    camera_embeds_dim: 0
    n_views: ${data.n_views}
    empty_embeds_ratio: 0.1       # condition-dropout rate for classifier-free guidance
    normalize_embeds: false
    zero_uncond_embeds: true      # unconditional branch uses zero embeddings
    linear_proj_init: constant
    image_size_dino: 224
    image_size_clip: 224
  # PixArt-style transformer denoiser operating on the shape latents;
  # channel/context sizes are tied to the shape model via interpolation.
  denoiser_model_type: "pixart-denoiser"
  denoiser_model:
    input_channels: ${system.shape_model.embed_dim}
    output_channels: ${system.shape_model.embed_dim}
    n_ctx: ${system.shape_model.num_latents}   # sequence length = number of latents
    width: 768
    layers: 32
    heads: 12
    context_dim: 1024             # conditioner embedding dim fed via cross-attention
    init_scale: 1.0
    skip_ln: true
    variance_type: ${system.noise_scheduler.variance_type}
    use_checkpoint: true          # activation checkpointing
    dit_block: DiTBlock
  # Training-time noising: DDPM with Stable-Diffusion-style betas
  # (scaled_linear, 0.00085 -> 0.012 over 1000 steps).
  noise_scheduler_type: "diffusers.schedulers.DDPMScheduler"
  noise_scheduler:
    num_train_timesteps: 1000
    beta_start: 0.00085
    beta_end: 0.012
    beta_schedule: "scaled_linear"
    variance_type: "fixed_small"
    clip_sample: false
  # Inference-time sampling: DDIM (matches the "ddim" suffix of this
  # config); beta settings mirror the training scheduler above.
  denoise_scheduler_type: "diffusers.schedulers.DDIMScheduler"
  denoise_scheduler:
    num_train_timesteps: 1000
    beta_start: 0.00085
    beta_end: 0.012
    beta_schedule: "scaled_linear"
    clip_sample: false # clip sample to -1~1
    set_alpha_to_one: false
    steps_offset: 1
loggers:
wandb:
enable: false
project: "CraftsMan"
name: image-to-shape-diffusion+${name}+${tag}
loss:
loss_type: "mse"
lambda_diffusion: 1.
optimizer:
name: AdamW
args:
lr: 2.e-4
betas: [0.9, 0.99]
eps: 1.e-6
scheduler:
name: CosineAnnealingLR
args:
T_max: 5000
eta_min: 1e-6
# PyTorch-Lightning Trainer settings (flag names match Lightning's Trainer
# API — presumably consumed directly; confirm against the launch script).
trainer:
  num_nodes: 1
  max_epochs: 100000
  log_every_n_steps: 5
  num_sanity_val_steps: 1
  check_val_every_n_epoch: 25
  enable_progress_bar: true
  precision: 16-mixed             # fp16 mixed precision
  strategy: 'ddp_find_unused_parameters_true'
checkpoint:
  save_last: true
  save_top_k: -1                  # -1 = keep every checkpoint
  every_n_train_steps: 5000