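# CraftsMan image-to-shape diffusion: a PixArt-style DiT denoiser trained in the
# latent space of a Michelangelo shape autoencoder, conditioned on CLIP + DINOv2
# image features. Noise is added with a DDPM schedule during training; sampling
# uses 50 DDIM steps.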
exp_root_dir: "outputs"
name: "image-to-shape-diffusion/clip-dino-rgb-pixart-lr2e4-ddim"
tag: "${rmspace:${system.shape_model_type}+n${data.n_samples}+pfeat${system.shape_model.point_feats}+lr${system.optimizer.args.lr},_}"
seed: 0
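
# Data: Objaverse SDF point samples paired with single-view RGB renders
# (20 candidate view indices in `idx`, one view used per sample).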
data_type: "objaverse-datamodule"
data:
  root_dir: ./data/objaverse
  data_type: "sdf"
  sampling_strategy: random
  n_samples: 10240
  load_supervision: False
  supervision_type: ""
  n_supervision: 0

  load_image: True # whether to load images
  image_data_path: ./data/objaverse/render+blender+singleview+nv20
  image_type: "rgb" # rgb, normal
  idx: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] # front view
  n_views: 1

  background_color: [0.5, 0.5, 0.5]
  marign_pix_dis: 30

  batch_size: 40
  num_workers: 16
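
# System: ties together the shape autoencoder, the image condition embedder,
# the denoiser, and the train/inference noise schedulers. guidance_scale is
# the classifier-free guidance weight applied at sampling time.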
system_type: "pixart-diffusion-system"
system:
  val_samples_json: "val_data/images/val_samples_rgb_image.json"
  z_scale_factor: 1.0
  guidance_scale: 7.5
  num_inference_steps: 50
  eta: 0.0
  extract_mesh_func: diffdmc
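
  # Shape autoencoder: encodes sampled surface points (xyz plus 3 extra
  # per-point features) into 768 latent tokens of 64 channels; diffusion runs
  # in this latent space.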
  shape_model_type: michelangelo-autoencoder
  shape_model:
    pretrained_model_name_or_path: /mnt/cfs/public/native3D/ckpts/michelangelo-autoencoder-l256-e64-ne8-nd16-scaleup.ckpt
    use_downsample: true
    downsample_ratio: 0.0625
    num_latents: 768
    use_multi_reso: false
    resolutions: [4096, 8192, 12288]
    sampling_prob: [0, 0, 1]
    embed_dim: 64
    point_feats: 3
    out_dim: 1
    num_freqs: 8
    include_pi: false
    heads: 12
    width: 768
    num_encoder_layers: 8
    num_decoder_layers: 16
    use_ln_post: true
    init_scale: 0.25
    qkv_bias: false
    use_flash: true
    use_checkpoint: true
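
  # Image conditioning: frozen CLIP ViT-L/14 and DINOv2-base encoders; 10% of
  # conditions are replaced by zero embeddings (empty_embeds_ratio) so that
  # classifier-free guidance works at inference.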
  condition_model_type: "cond-embedder"
  condition_model:
    pretrained_clip_name_or_path: openai/clip-vit-large-patch14
    pretrained_dino_name_or_path: facebook/dinov2-base
    pretrained_tokenizer_name_or_path: openai/clip-vit-large-patch14
    freeze_modulation_clip: true
    freeze_modulation_dino: true
    encode_camera: false
    camera_embeds_dim: 0
    n_views: ${data.n_views}
    empty_embeds_ratio: 0.1
    normalize_embeds: false
    zero_uncond_embeds: true
    linear_proj_init: constant
    image_size_dino: 224
    image_size_clip: 224
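
  # Denoiser: a 32-layer PixArt-style DiT over the latent tokens. context_dim
  # 1024 matches the CLIP ViT-L hidden size; DINO features are presumably
  # projected to the same width by the embedder.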
  denoiser_model_type: "pixart-denoiser"
  denoiser_model:
    input_channels: ${system.shape_model.embed_dim}
    output_channels: ${system.shape_model.embed_dim}
    n_ctx: ${system.shape_model.num_latents}
    width: 768
    layers: 32
    heads: 12
    context_dim: 1024
    init_scale: 1.0
    skip_ln: true
    variance_type: ${system.noise_scheduler.variance_type}
    use_checkpoint: true
    dit_block: DiTBlock
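
  # Schedulers: DDPM defines the forward (training) noise process; DDIM with
  # eta: 0.0 (deterministic) is used for the 50-step inference sampling above.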
  noise_scheduler_type: "diffusers.schedulers.DDPMScheduler"
  noise_scheduler:
    num_train_timesteps: 1000
    beta_start: 0.00085
    beta_end: 0.012
    beta_schedule: "scaled_linear"
    variance_type: "fixed_small"
    clip_sample: false

  denoise_scheduler_type: "diffusers.schedulers.DDIMScheduler"
  denoise_scheduler:
    num_train_timesteps: 1000
    beta_start: 0.00085
    beta_end: 0.012
    beta_schedule: "scaled_linear"
    clip_sample: false # clip samples to [-1, 1]
    set_alpha_to_one: false
    steps_offset: 1
  loggers:
    wandb:
      enable: false
      project: "CraftsMan"
      name: image-to-shape-diffusion+${name}+${tag}

  loss:
    loss_type: "mse"
    lambda_diffusion: 1.
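
  # Optimization: AdamW at 2e-4, cosine-annealed to 1e-6 over T_max steps.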
  optimizer:
    name: AdamW
    args:
      lr: 2.e-4
      betas: [0.9, 0.99]
      eps: 1.e-6

  scheduler:
    name: CosineAnnealingLR
    args:
      T_max: 5000
      eta_min: 1e-6
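
# Trainer (PyTorch Lightning-style flags): mixed-precision DDP; validation
# every 25 epochs, a checkpoint every 5000 training steps (see below).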
trainer:
  num_nodes: 1
  max_epochs: 100000
  log_every_n_steps: 5
  num_sanity_val_steps: 1
  check_val_every_n_epoch: 25
  enable_progress_bar: true
  precision: 16-mixed
  strategy: 'ddp_find_unused_parameters_true'
checkpoint:
  save_last: true
  save_top_k: -1
  every_n_train_steps: 5000
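
# A minimal sketch of loading this file, assuming the training code uses
# OmegaConf (the ${...} interpolations follow its syntax, and `rmspace` would
# be a custom resolver registered before loading; its exact behavior here is
# an assumption):
#
#   from omegaconf import OmegaConf
#
#   # Assumed resolver: replaces spaces with the given separator.
#   OmegaConf.register_new_resolver(
#       "rmspace", lambda s, sub: s.replace(" ", sub))
#   cfg = OmegaConf.load("clip-dino-rgb-pixart-lr2e4-ddim.yaml")  # filename assumed
#   print(cfg.system.denoiser_model.n_ctx)  # -> 768, via ${system.shape_model.num_latents}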