# ModelMan/configs/image-to-shape-diffusion/clip-mvrgb-modln-l256-e64-ne8-nd16-nl6.yaml
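# Image-to-shape latent diffusion config. The name encodes the main hyperparameters:
# CLIP multi-view RGB conditioning (clip-mvrgb), conditioning via modulated LayerNorm
# (modln, presumably), 256 latents (l256), 64 latent channels (e64), 8 encoder / 16 decoder
# layers in the shape autoencoder (ne8/nd16), and 6 denoiser layers (nl6).
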
exp_root_dir: "outputs"
name: "image-to-shape-diffusion/clip-mvrgb-modln-l256-e64-ne8-nd16-nl6"
tag: "${rmspace:${system.shape_model_type}+n${data.n_samples}+noise${data.noise_sigma}+pfeat${system.shape_model.point_feats}+normemb${system.condition_model.normalize_embeds}+lr${system.optimizer.args.lr}+qkvbias${system.shape_model.qkv_bias}+nfreq${system.shape_model.num_freqs}+ln_post${system.shape_model.use_ln_post},_}"
seed: 0
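
# Data: Objaverse surface point samples plus pre-rendered multi-view RGB images
# (cap3d_high_quality_170k). load_supervision is False, so no occupancy targets are loaded;
# presumably only the denoiser is trained while the pretrained shape autoencoder stays frozen.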
data_type: "objaverse-datamodule"
data:
  root_dir: "data/objaverse_clean/cap3d_high_quality_170k_images"
  data_type: "occupancy"
  n_samples: 4096
  noise_sigma: 0.
  load_supervision: False
  supervision_type: "occupancy"
  n_supervision: 4096
  load_image: True # whether to load images
  image_data_path: data/objaverse_clean/raw_data/images/cap3d_high_quality_170k
  image_type: "mvrgb" # rgb, normal, mvrgb, mvnormal
  idx: [0, 4, 8, 12, 16]
  n_views: 4
  load_caption: False # whether to load captions
  rotate_points: False
  batch_size: 32
  num_workers: 16
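
# System: diffusion over the shape autoencoder's latent set, sampled at inference with
# classifier-free guidance (guidance_scale 7.5) and 50 DDIM steps.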
system_type: "shape-diffusion-system"
system:
  val_samples_json: "val_data/mv_images/val_samples_rgb_mvimage.json"
  z_scale_factor: 1.0
  guidance_scale: 7.5
  num_inference_steps: 50
  eta: 0.0
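
  # Shape autoencoder: Michelangelo-style model (256 latents x 64 channels) restored from a
  # previous run's checkpoint; it defines the latent space the denoiser below operates in.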
  shape_model_type: "michelangelo-autoencoder"
  shape_model:
    # pretrained_model_name_or_path: ./ckpts/3DNativeGeneration/michelangelo-aligned-autoencoder-l256-e64-ne8-nd16.ckpt
    pretrained_model_name_or_path: "./outputs/image-to-shape-diffusion_bak/clip-mvrgb-modln-l256-e64-ne8-nd16-nl6/michelangelo-autoencoder+n4096+noise0.0+pfeat3+normembFalse+lr5e-05+qkvbiasFalse+nfreq8+ln_postTrue/ckpts/last.ckpt"
    num_latents: 256
    embed_dim: 64
    point_feats: 3 # xyz + normal
    out_dim: 1 # only occupancy
    num_freqs: 8
    include_pi: false
    heads: 12
    width: 768
    num_encoder_layers: 8
    num_decoder_layers: 16
    use_ln_post: true
    init_scale: 0.25
    qkv_bias: false
    use_flash: true
    use_checkpoint: true
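
  # Image condition: CLIP ViT-L/14 image embedder over the multi-view renders, plus camera
  # embeddings (32 dims = 16 x 2 for sin/cos). empty_embeds_ratio drops the condition for 10%
  # of training samples to enable classifier-free guidance.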
  condition_model_type: "clip-embedder"
  condition_model:
    pretrained_model_name_or_path: "./ckpts/pretrained_weights/huggingface/hub/models--openai--clip-vit-large-patch14/snapshots/8d052a0f05efbaefbc9e8786ba291cfdf93e5bff"
    encode_camera: true
    camera_embeds_dim: 32 # 16 * 2 (sin, cos)
    n_views: ${data.n_views}
    empty_embeds_ratio: 0.1
    normalize_embeds: false
    # zero_uncond_embeds: true
    zero_uncond_embeds: false
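
  # Denoiser: a transformer over the 256 x 64 latent tokens (n_ctx and channels are interpolated
  # from the shape model above), conditioned on 1024-dim CLIP features, matching the ViT-L/14
  # hidden size.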
  denoiser_model_type: "simple-denoiser"
  denoiser_model:
    # pretrained_model_name_or_path: "./ckpts/CraftsMan/clip-mvrgb-modln-l256-e64-ne8-nd16-nl6.pth"
    pretrained_model_name_or_path: "./ckpts/CraftsMan/clip-mvrgb-modln-l256-e64-ne8-nd16-nl6-It500000.pth"
    input_channels: ${system.shape_model.embed_dim}
    output_channels: ${system.shape_model.embed_dim}
    n_ctx: ${system.shape_model.num_latents}
    width: 768
    layers: 6 # 2 * 6 + 1 = 13
    heads: 12
    context_dim: 1024
    init_scale: 1.0
    skip_ln: true
    use_checkpoint: true
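
  # Noise schedules: DDPM defines the training noise schedule; the DDIM scheduler below is used
  # for sampling, with num_inference_steps and eta taken from the system settings above.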
  noise_scheduler_type: "diffusers.schedulers.DDPMScheduler"
  noise_scheduler:
    num_train_timesteps: 1000
    beta_start: 0.00085
    beta_end: 0.012
    beta_schedule: "scaled_linear"
    variance_type: "fixed_small"
    clip_sample: false

  denoise_scheduler_type: "diffusers.schedulers.DDIMScheduler"
  denoise_scheduler:
    num_train_timesteps: 1000
    beta_start: 0.00085
    beta_end: 0.012
    beta_schedule: "scaled_linear"
    clip_sample: false # clip samples to [-1, 1]
    set_alpha_to_one: false
    steps_offset: 1
  loggers:
    wandb:
      enable: false
      project: "CraftsMan"
      name: image-to-shape-diffusion+${name}+${tag}

  loss:
    loss_type: "mse"
    lambda_diffusion: 1.

  optimizer:
    name: AdamW
    args:
      lr: 5.e-5
      betas: [0.9, 0.99]
      eps: 1.e-6
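
  # LR schedule: 5000-step linear warmup (factor 1e-6 -> 1.0) followed by cosine annealing;
  # milestones switches SequentialLR between the two at step 5000.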
  scheduler:
    name: SequentialLR
    interval: step
    schedulers:
      - name: LinearLR
        interval: step
        args:
          start_factor: 1e-6
          end_factor: 1.0
          total_iters: 5000
      - name: CosineAnnealingLR
        interval: step
        args:
          T_max: 5000
          eta_min: 0.
    milestones: [5000]
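
# Lightning trainer settings: max_epochs is presumably just a large cap since checkpointing and
# the LR schedule are step-based; mixed precision and DDP with unused-parameter detection.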
trainer:
  num_nodes: 1
  max_epochs: 100000
  log_every_n_steps: 5
  num_sanity_val_steps: 1
  check_val_every_n_epoch: 3
  enable_progress_bar: true
  precision: 16-mixed
  strategy: 'ddp_find_unused_parameters_true'
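
# Checkpointing: save_top_k -1 keeps every checkpoint; a snapshot is written every 5000 training
# steps, plus a rolling "last" checkpoint.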
checkpoint:
  save_last: true
  save_top_k: -1
  every_n_train_steps: 5000
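
# A typical threestudio-style launch for a config like this (an assumption, not taken from this
# repo; check its README for the exact entry point and flags):
#   python launch.py --config configs/image-to-shape-diffusion/clip-mvrgb-modln-l256-e64-ne8-nd16-nl6.yaml --train --gpu 0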