|
# Hydra/OmegaConf training configuration for a MoDE (Mixture-of-Denoising-Experts)
# diffusion policy trained on Open X-Embodiment-style robot data via the Uha
# dataloader, optimized with an Accelerate-based trainer.
#
# NOTE(review): the original file had lost all indentation — every key sat at
# column 0, producing duplicate top-level keys, which is invalid YAML. The
# nesting below was reconstructed from key order and the `_target_` class
# paths; verify against the originally composed config before relying on it.
# Keys at the bottom of the file are flat globals consumed via ${...}
# interpolation by the sections above.

datamodule:
  # Per-sample transforms applied inside the datamodule.
  transforms:
    combine_goal_obs: false
    move_axis: false
    bytes_to_string: true
    adjust_type: null
    add_robot_information: false
    # Tokenizer turning language instructions into CLIP token ids.
    language_encoders:
      _target_: medit.agents.input_encoders.goal_encoders.language_encoders.clip_tokens.TokenLangClip
      _recursive_: false
      model_name: ${clip_lang_model_name}
  _target_: oxe_torch_dataloader.uha.uha_datamodule.UhaDataModule
  _recursive_: false
  num_workers: ${num_workers}
  batch_size: ${batch_size}
  pin_memory: ${pin_memory}
  drop_last: ${drop_last}
  datasets:
    DATA_NAME: ${DATA_NAME}
    # Public Open X-Embodiment GCS bucket.
    DATA_PATH: gs://gresearch/robotics
    load_camera_views: ${load_camera_views}
    dataset_size_limit: ${dataset_size_limit}
    action_proprio_normalization_type: bounds
    # Octo-style interleaved tf.data pipeline settings.
    interleaved_dataset_cfg:
      shuffle_buffer_size: ${shuffle_buffer_size}
      balance_weights: true
      # Trajectory-level transforms (relabeling, windowing, subsampling).
      traj_transform_kwargs:
        goal_relabeling_strategy: ${goal_relabeling_strategy}
        goal_relabeling_kwargs: ${goal_relabeling_kwargs}
        window_size: ${window_size}
        action_horizon: ${act_seq_len}
        subsample_length: ${subsample_length}
        skip_unlabeled: ${skip_unlabeled}
      # Frame-level transforms (augmentation + resizing), keyed per camera view.
      frame_transform_kwargs:
        # The wrist view intentionally gets no random_resized_crop.
        image_augment_kwargs:
          primary:
            random_resized_crop:
              scale: [0.8, 1.0]
              ratio: [0.9, 1.1]
            random_brightness: [0.1]
            random_contrast: [0.9, 1.1]
            random_saturation: [0.9, 1.1]
            random_hue: [0.05]
            augment_order:
              - random_resized_crop
              - random_brightness
              - random_contrast
              - random_saturation
              - random_hue
          secondary:
            random_resized_crop:
              scale: [0.8, 1.0]
              ratio: [0.9, 1.1]
            random_brightness: [0.1]
            random_contrast: [0.9, 1.1]
            random_saturation: [0.9, 1.1]
            random_hue: [0.05]
            augment_order:
              - random_resized_crop
              - random_brightness
              - random_contrast
              - random_saturation
              - random_hue
          wrist:
            random_brightness: [0.1]
            random_contrast: [0.9, 1.1]
            random_saturation: [0.9, 1.1]
            random_hue: [0.05]
            augment_order:
              - random_brightness
              - random_contrast
              - random_saturation
              - random_hue
        # Resolution of current observations fed to the policy.
        resize_size:
          primary: [224, 224]
          secondary: [224, 224]
          wrist: [224, 224]
        # Lower resolution for future-observation (goal image) frames.
        resize_size_future_obs:
          primary: [112, 112]
          secondary: [112, 112]
          wrist: [112, 112]
        num_parallel_calls: 128
      traj_transform_threads: 64
      traj_read_threads: 32

trainer:
  agent:
    # Inner agent config; wrapped for distributed training by DDPAgentWrapper below.
    agent:
      # Frozen CLIP text encoder producing language goal embeddings.
      language_goal:
        _target_: medit.agents.input_encoders.goal_encoders.language_encoders.clip_tokens.LangClip
        _recursive_: false
        freeze_backbone: true
        model_name: ${clip_lang_model_name}
      # EDM-style denoiser wrapping the MoDE diffusion transformer.
      model:
        _target_: medit.agents.inner_models.edm_diffusion_policy.score_wrappers.GCDenoiser
        _recursive_: true
        sigma_data: 0.5
        inner_model:
          _target_: medit.agents.inner_models.modedit.MoDeDiT
          action_dim: ${act_dim}
          goal_dim: ${goal_dim}
          obs_dim: 2048
          goal_conditioned: true
          causal: true
          use_custom_attn_mask: false
          use_proprio: false
          state_dim: 8
          embed_dim: 1024
          n_layers: 12
          goal_seq_len: 1
          obs_seq_len: ${obs_seq_len}
          action_seq_len: ${act_seq_len}
          embed_pdrob: 0
          goal_drop: 0.1
          attn_pdrop: 0.3
          mlp_pdrop: 0.1
          n_heads: 8
          linear_output: true
          # Mixture-of-experts routing settings.
          cond_router: true
          num_experts: 4
          top_k: 2
          router_normalize: true
          use_goal_in_routing: false
          use_argmax: false
          use_shared_expert: false
          use_noise_token_as_input: true
          init_style: olmoe
      _target_: medit.agents.mode_agent.MoDEAgent
      _recursive_: false
      latent_dim: 1024
      multistep: 5
      # DDIM sampling with few steps for fast action inference.
      sampler_type: ddim
      num_sampling_steps: 5
      sigma_data: 0.5
      sigma_min: 0.001
      sigma_max: 80
      noise_scheduler: exponential
      sigma_sample_density_type: loglogistic
      act_window_size: ${act_seq_len}
      act_dim: ${act_dim}
      seed: ${seed}
      obs_modalities: ${obs_modalities}
      goal_modalities: ${goal_modalities}
      img_modalities: ${img_modalities}
      lang_modalities: ${lang_modalities}
      target_modality: ${target_modality}
      entropy_gamma: 0.01
      router_z_delta: 0.0
      # Quoted so YAML keeps it a string, not the integer 50.
      resnet_type: '50'
    _target_: medit.agents.ddp_wrapper.DDPAgentWrapper
    _recursive_: false
    obs_modalities: ${obs_modalities}
    goal_modalities: ${goal_modalities}
    img_modalities: ${img_modalities}
    lang_modalities: ${lang_modalities}
    target_modality: ${target_modality}
  _target_: medit.trainers.accelerate_trainer.AccelerateTrainer
  _recursive_: false
  weight_decay:
    transformer_weight_decay: 0.1
    obs_encoder_weight_decay: 0.1
    # NOTE(review): a learning rate grouped under `weight_decay` looks odd —
    # confirm AccelerateTrainer expects it here rather than at trainer level.
    perceptual_encoder_lr: 0.0001
  lr_scheduler: ${lr_scheduler}
  eval_every_n_steps: ${eval_every_n_steps}
  save_every_n_steps: ${save_every_n_steps}
  max_train_steps: ${max_train_steps}
  max_eval_steps: ${max_eval_steps}
  use_ema: true
  decay: ${decay}
  rampup_ratio: ${rampup_ratio}
  update_ema_every_n_steps: ${update_ema_every_n_steps}
  batch_size: ${batch_size}
  obs_modalities: ${obs_modalities}
  goal_modalities: ${goal_modalities}
  img_modalities: ${img_modalities}
  lang_modalities: ${lang_modalities}
  target_modality: ${target_modality}

# --------------------------------------------------------------------------
# Flat globals referenced through ${...} interpolation above.
# --------------------------------------------------------------------------
vis_clip_model_name: ViT-B/16
clip_lang_model_name: ViT-B/32
DATA_NAME: MO

wandb:
  # ${now:...} is the Hydra time resolver.
  name: uha_${now:%H-%M-%S}
  group: ${now:%Y-%m-%d}
  project: simulation_eval
  entity: irl-masterthesis
  mode: null

lr_scheduler:
  _target_: medit.agents.utils.lr_schedulers.InverseSquareRootLRSchedule
  num_warmup_steps: 1000
  timescale: ${max_train_steps}

log_dir: logs/
window_size: 1
obs_seq_len: 1
goal_window_size: 1
seed: 42
obs_dim: 512
goal_dim: 512
act_seq_len: 10
update_ema_every_n_steps: 1
decay: 0.999
rampup_ratio: 0.001
gen_img_res: 112
num_tokens_voltron: 10
img_gen_frame_diff: 3
use_modality_encoder: false
goal_relabeling_strategy: null
goal_relabeling_kwargs:
  min_bound: 20
  max_bound: 50
  frame_diff: ${img_gen_frame_diff}
subsample_length: null
skip_unlabeled: true
load_camera_views:
  - primary
  - secondary
  - wrist
obs_modalities: observation
goal_modalities: task
img_modalities:
  - image_primary
  - image_secondary
  - image_wrist
lang_modalities:
  - language_instruction
target_modality: action
drop_last: true
pin_memory: true
num_workers: 0
gradient_accumulation_steps: 1
act_dim: 7
max_train_steps: 300000
max_eval_steps: 200
eval_every_n_steps: 5000
save_every_n_steps: 5000
shuffle_buffer_size: 400000
batch_size: 512
dataset_size_limit: null
|