Spaces:

soonyau
/

visconet

Sleeping

App Files Community

soonyau committed on Feb 5, 2024

Commit

664539f

1 Parent(s): 43557dc

fix

Browse files

Files changed (6) hide show

configs/mm_local_style.yaml +0 -147
configs/pose_transfer.yaml +0 -175
configs/visconet_v15.yaml +0 -145
configs/visconet_v1_1.yaml +0 -168
configs/visconet_v21.yaml +0 -149
requirements.txt +2 -2

configs/mm_local_style.yaml DELETED Viewed

@@ -1,147 +0,0 @@
-model:
-  target: visconet.visconet.ViscoNetLDM
-  params:
-    linear_start: 0.00085
-    linear_end: 0.0120
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: "jpg"
-    cond_stage_key: "txt"
-    control_key: "hint"
-    control_crossattn_key: "styles"
-    mask_key: "human_mask"
-    image_size: 64
-    channels: 4
-    cond_stage_trainable: false
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    scale_factor: 0.18215
-    use_ema: False
-    only_mid_control: False
-    control_cond_config:
-      target: visconet.modules.ProjectLocalStyle
-       #target: visconet.modules.ClipImageEncoder
-    control_stage_config:
-      target: cldm.cldm.ControlNet
-      params:
-        use_checkpoint: True
-        image_size: 32 # unused
-        in_channels: 4
-        hint_channels: 3
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_head_channels: 64 # need to fix for flash-attn
-        use_spatial_transformer: True
-        use_linear_in_transformer: True
-        transformer_depth: 1
-        context_dim: 1024
-        legacy: False
-    unet_config:
-      target: cldm.cldm.ControlledUnetModel
-      params:
-        use_checkpoint: True
-        image_size: 32 # unused
-        in_channels: 4
-        out_channels: 4
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_head_channels: 64 # need to fix for flash-attn
-        use_spatial_transformer: True
-        use_linear_in_transformer: True
-        transformer_depth: 1
-        context_dim: 1024
-        legacy: False
-    first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        monitor: val/rec_loss
-        ddconfig:
-          #attn_type: "vanilla-xformers"
-          double_z: true
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
-      params:
-        freeze: True
-        layer: "penultimate"
-  style_embedding_config:
-    target: scripts.image_emb_hidden.ClipImageEncoder
-dataset:
-  train:
-    target: visconet.deepfashion.DeepFashionDataset
-    params:
-      image_root: "/home/soon/datasets/deepfashion_inshop"
-      image_dir: img_512_padded
-      pose_dir: openpose_512
-      style_dir: styles
-      style_postfix: _hidden
-      mask_dir: mask_512_padded
-      data_files:
-      - data/deepfashion/mm-train-all.csv
-      map_file: data/deepfashion/deepfashion_map.csv
-      style_emb_shape:
-        - 257
-        - 1024
-      style_names:
-        - background
-        - face
-        - hair
-        - headwear
-        - top
-        - outer
-        - bottom
-        - shoes
-        - accesories
-  val:
-    target: visconet.deepfashion.DeepFashionDataset
-    params:
-      image_root: "/home/soon/datasets/deepfashion_inshop"
-      image_dir: img_512_padded
-      pose_dir: openpose_512
-      style_dir: styles
-      style_postfix: _hidden
-      mask_dir: mask_512_padded
-      data_files:
-      - data/deepfashion/mm-test-all.csv
-      map_file: data/deepfashion/deepfashion_map.csv
-      #sample_ratio: 0.1
-      style_emb_shape:
-        - 257
-        - 1024
-      style_names:
-        - background
-        - face
-        - hair
-        - headwear
-        - top
-        - outer
-        - bottom
-        - shoes
-        - accesories

configs/pose_transfer.yaml DELETED Viewed

@@ -1,175 +0,0 @@
-model:
-  target: visconet.visconet.ViscoNetLDM
-  params:
-    linear_start: 0.00085
-    linear_end: 0.0120
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: "jpg"
-    cond_stage_key: "txt"
-    control_key: "hint"
-    control_crossattn_key: "styles"
-    mask_key: "human_mask"
-    image_size: 64
-    channels: 4
-    cond_stage_trainable: false
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    scale_factor: 0.18215
-    use_ema: False
-    only_mid_control: False
-    control_cond_config:
-      target: visconet.modules.ProjectLocalStyle
-      #target: visconet.modules.ClipImageEncoder
-      # params:
-      #    context_dim: 1024
-    control_stage_config:
-      target: cldm.cldm.ControlNet
-      params:
-        use_checkpoint: True
-        image_size: 32 # unused
-        in_channels: 4
-        hint_channels: 3
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_head_channels: 64 # need to fix for flash-attn
-        use_spatial_transformer: True
-        use_linear_in_transformer: True
-        transformer_depth: 1
-        context_dim: 1024
-        legacy: False
-    unet_config:
-      target: cldm.cldm.ControlledUnetModel
-      params:
-        use_checkpoint: True
-        image_size: 32 # unused
-        in_channels: 4
-        out_channels: 4
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_head_channels: 64 # need to fix for flash-attn
-        use_spatial_transformer: True
-        use_linear_in_transformer: True
-        transformer_depth: 1
-        context_dim: 1024
-        legacy: False
-    first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        monitor: val/rec_loss
-        ddconfig:
-          #attn_type: "vanilla-xformers"
-          double_z: true
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
-      params:
-        freeze: True
-        layer: "penultimate"
-  style_embedding_config:
-    target: scripts.image_emb_hidden.ClipImageEncoder
-dataset:
-  train:
-    target: visconet.deepfashion.DeepFashionDataset
-    params:
-      image_root: "/home/soon/datasets/deepfashion_inshop"
-      image_dir: img_512_padded
-      pose_dir: openpose_hand_512
-      style_dir: styles
-      style_postfix: _hidden
-      mask_dir: smpl_256
-      data_files:
-         - data/deepfashion/pairs-train-all.csv
-         - data/deepfashion/solo-train-all.csv
-      map_file: data/deepfashion/deepfashion_map.csv
-      style_emb_shape:
-        - 257
-        - 1024
-      style_names:
-        - background
-        - face
-        - hair
-        - headwear
-        - top
-        - outer
-        - bottom
-        - shoes
-        - accesories
-  val:
-    target: visconet.deepfashion.DeepFashionDataset
-    params:
-      image_root: "/home/soon/datasets/deepfashion_inshop"
-      image_dir: img_512_padded
-      pose_dir: openpose_hand_512
-      style_dir: styles
-      style_postfix: _hidden
-      mask_dir: smpl_256
-      data_files:
-         - data/deepfashion/solo-test-all.csv
-      map_file: data/deepfashion/deepfashion_map.csv
-      sample_ratio: 0.1
-      style_emb_shape:
-        - 257
-        - 1024
-      style_names:
-        - background
-        - face
-        - hair
-        - headwear
-        - top
-        - outer
-        - bottom
-        - shoes
-        - accesories
-  test:
-    target: visconet.deepfashion.DeepFashionDataset
-    params:
-      image_root: "/home/soon/datasets/deepfashion_inshop"
-      image_dir: img_512_padded
-      pose_dir: openpose_hand_512
-      style_dir: styles
-      style_postfix: _hidden
-      mask_dir: smpl_256
-      data_files:
-         - data/deepfashion/pairs-test-all.csv
-      map_file: data/deepfashion/deepfashion_map.csv
-      style_emb_shape:
-        - 257
-        - 1024
-      style_names:
-        - background
-        - face
-        - hair
-        - headwear
-        - top
-        - outer
-        - bottom
-        - shoes
-        - accesories

configs/visconet_v15.yaml DELETED Viewed

@@ -1,145 +0,0 @@
-model:
-  target: visconet.visconet.ViscoNetLDM
-  params:
-    linear_start: 0.00085
-    linear_end: 0.0120
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: "jpg"
-    cond_stage_key: "txt"
-    control_key: "hint"
-    control_crossattn_key: "styles"
-    mask_key: "human_mask"
-    image_size: 64
-    channels: 4
-    cond_stage_trainable: false
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    scale_factor: 0.18215
-    use_ema: False
-    only_mid_control: False
-    control_cond_config:
-      target: visconet.modules.LinearProj
-       #target: visconet.modules.ClipImageEncoder
-      params:
-         context_dim: 768
-    control_stage_config:
-      target: cldm.cldm.ControlNet
-      params:
-        image_size: 32 # unused
-        in_channels: 4
-        hint_channels: 3
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_heads: 8
-        use_spatial_transformer: True
-        transformer_depth: 1
-        context_dim: 768
-        use_checkpoint: True
-        legacy: False
-    unet_config:
-      target: cldm.cldm.ControlledUnetModel
-      params:
-        image_size: 32 # unused
-        in_channels: 4
-        out_channels: 4
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_heads: 8
-        use_spatial_transformer: True
-        transformer_depth: 1
-        context_dim: 768
-        use_checkpoint: True
-        legacy: False
-    first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        monitor: val/rec_loss
-        ddconfig:
-          double_z: true
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
-  style_embedding_config:
-    target: scripts.image_emb.ClipImageEncoder
-    params:
-      context_dim: 768
-dataset:
-  train:
-    target: visconet.deepfashion.DeepFashionDataset
-    params:
-      image_root: "/home/soon/datasets/deepfashion_inshop"
-      image_dir: img_512_padded
-      pose_dir: openpose_512
-      style_dir: styles
-      mask_dir: mask_512_padded
-      data_files:
-      - data/deepfashion/mm-train-all.csv
-      map_file: data/deepfashion/deepfashion_map.csv
-      style_emb_shape:
-        - 1
-        - 768 #control_stage_config.params.context_dim
-      style_names:
-        - face
-        - hair
-        - headwear
-        - background
-        - top
-        - outer
-        - bottom
-        - shoes
-        - accesories
-  val:
-    target: visconet.deepfashion.DeepFashionDataset
-    params:
-      image_root: "/home/soon/datasets/deepfashion_inshop"
-      image_dir: img_512_padded
-      pose_dir: openpose_512
-      style_dir: styles
-      mask_dir: mask_512_padded
-      data_files:
-      - data/deepfashion/mm-test-all.csv
-      map_file: data/deepfashion/deepfashion_map.csv
-      sample_ratio: 0.1
-      style_emb_shape:
-        - 1
-        - 768 #control_stage_config.params.context_dim
-      style_names:
-        - face
-        - hair
-        - headwear
-        - background
-        - top
-        - outer
-        - bottom
-        - shoes
-        - accesories

configs/visconet_v1_1.yaml DELETED Viewed

@@ -1,168 +0,0 @@
-model:
-  target: visconet.visconet.ViscoNetLDM
-  params:
-    linear_start: 0.00085
-    linear_end: 0.0120
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: "jpg"
-    cond_stage_key: "txt"
-    control_key: "hint"
-    control_crossattn_key: "styles"
-    mask_key: "human_mask"
-    image_size: 64
-    channels: 4
-    cond_stage_trainable: false
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    scale_factor: 0.18215
-    use_ema: False
-    only_mid_control: False
-    scheduler_config:
-      target: torch.optim.lr_scheduler.ReduceLROnPlateau
-      monitor: val/loss_simple_ema
-      params:
-        mode: min
-        factor: 0.5
-        patience: 3
-        cooldown: 0
-        min_lr: 0.00001
-        threshold: 0.001
-        verbose: True
-    control_cond_config:
-      target: visconet.modules.ProjectLocalStyle
-      params:
-        pool_size: 9
-        local_emb_size: 257
-        bias: True
-       #target: visconet.modules.ClipImageEncoder
-    control_stage_config:
-      target: cldm.cldm.ControlNet
-      params:
-        use_checkpoint: True
-        image_size: 32 # unused
-        in_channels: 4
-        hint_channels: 3
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_head_channels: 64 # need to fix for flash-attn
-        use_spatial_transformer: True
-        use_linear_in_transformer: True
-        transformer_depth: 1
-        context_dim: 1024
-        legacy: False
-    unet_config:
-      target: cldm.cldm.ControlledUnetModel
-      params:
-        use_checkpoint: True
-        image_size: 32 # unused
-        in_channels: 4
-        out_channels: 4
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_head_channels: 64 # need to fix for flash-attn
-        use_spatial_transformer: True
-        use_linear_in_transformer: True
-        transformer_depth: 1
-        context_dim: 1024
-        legacy: False
-    first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        monitor: val/rec_loss
-        ddconfig:
-          #attn_type: "vanilla-xformers"
-          double_z: true
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
-      params:
-        freeze: True
-        layer: "penultimate"
-  style_embedding_config:
-    target: scripts.image_emb_hidden.ClipImageEncoder
-dataset:
-  train:
-    target: visconet.deepfashion.DeepFashionDataset
-    params:
-      image_root: "/home/soon/datasets/deepfashion_inshop"
-      image_dir: img_512_padded
-      pose_dir: openpose_hand_default_512
-      style_dir: styles_default
-      style_postfix: _hidden
-      mask_dir: smpl_256
-      data_files:
-         - data/deepfashion/pairs-train-all.csv
-         - data/deepfashion/solo-train-all.csv
-      map_file: data/deepfashion/deepfashion_map.csv
-      style_emb_shape:
-        - 257
-        - 1024
-      crop_shape:
-        - 512
-        - 384
-      style_names:
-        - face
-        - hair
-        - headwear
-        - top
-        - outer
-        - bottom
-        - shoes
-        - accesories
-  val:
-    target: visconet.deepfashion.DeepFashionDataset
-    params:
-      image_root: "/home/soon/datasets/deepfashion_inshop"
-      image_dir: img_512_padded
-      pose_dir: openpose_hand_default_512
-      style_dir: styles_default
-      style_postfix: _hidden
-      mask_dir: smpl_256
-      data_files:
-      - data/deepfashion/pairs-test-all.csv
-      map_file: data/deepfashion/deepfashion_map.csv
-      sample_ratio: 1.0
-      style_emb_shape:
-        - 257
-        - 1024
-      crop_shape:
-        - 512
-        - 384
-      style_names:
-        - face
-        - hair
-        - headwear
-        - top
-        - outer
-        - bottom
-        - shoes
-        - accesories

configs/visconet_v21.yaml DELETED Viewed

@@ -1,149 +0,0 @@
-model:
-  target: visconet.visconet.ViscoNetLDM
-  params:
-    linear_start: 0.00085
-    linear_end: 0.0120
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: "jpg"
-    cond_stage_key: "txt"
-    control_key: "hint"
-    control_crossattn_key: "styles"
-    mask_key: "human_mask"
-    image_size: 64
-    channels: 4
-    cond_stage_trainable: false
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    scale_factor: 0.18215
-    use_ema: False
-    only_mid_control: False
-    control_cond_config:
-      target: visconet.modules.LinearProj
-       #target: visconet.modules.ClipImageEncoder
-      params:
-         context_dim: 1024
-    control_stage_config:
-      target: cldm.cldm.ControlNet
-      params:
-        use_checkpoint: True
-        image_size: 32 # unused
-        in_channels: 4
-        hint_channels: 3
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_head_channels: 64 # need to fix for flash-attn
-        use_spatial_transformer: True
-        use_linear_in_transformer: True
-        transformer_depth: 1
-        context_dim: 1024
-        legacy: False
-    unet_config:
-      target: cldm.cldm.ControlledUnetModel
-      params:
-        use_checkpoint: True
-        image_size: 32 # unused
-        in_channels: 4
-        out_channels: 4
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_head_channels: 64 # need to fix for flash-attn
-        use_spatial_transformer: True
-        use_linear_in_transformer: True
-        transformer_depth: 1
-        context_dim: 1024
-        legacy: False
-    first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        monitor: val/rec_loss
-        ddconfig:
-          #attn_type: "vanilla-xformers"
-          double_z: true
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
-      params:
-        freeze: True
-        layer: "penultimate"
-  style_embedding_config:
-    target: scripts.image_emb.ClipImageEncoder
-    params:
-      context_dim: 768
-dataset:
-  train:
-    target: visconet.deepfashion.DeepFashionDataset
-    params:
-      image_root: "/home/soon/datasets/deepfashion_inshop"
-      image_dir: img_512_padded
-      pose_dir: openpose_512
-      style_dir: styles
-      mask_dir: mask_512_padded
-      data_files:
-      - data/deepfashion/mm-train-all.csv
-      map_file: data/deepfashion/deepfashion_map.csv
-      style_emb_shape:
-        - 1
-        - 768 #control_stage_config.params.context_dim
-      style_names:
-        - face
-        - hair
-        - headwear
-        - background
-        - top
-        - outer
-        - bottom
-        - shoes
-        - accesories
-  val:
-    target: visconet.deepfashion.DeepFashionDataset
-    params:
-      image_root: "/home/soon/datasets/deepfashion_inshop"
-      image_dir: img_512_padded
-      pose_dir: openpose_512
-      style_dir: styles
-      mask_dir: mask_512_padded
-      data_files:
-      - data/deepfashion/mm-test-all.csv
-      map_file: data/deepfashion/deepfashion_map.csv
-      sample_ratio: 0.1
-      style_emb_shape:
-        - 1
-        - 768 #control_stage_config.params.context_dim
-      style_names:
-        - face
-        - hair
-        - headwear
-        - background
-        - top
-        - outer
-        - bottom
-        - shoes
-        - accesories

requirements.txt CHANGED Viewed

@@ -2,6 +2,7 @@ pip==20.3
 torch==1.12.1
 torchvision==0.13.1
 numpy==1.23.1
 gradio==3.39.0
 albumentations==1.3.0
 imageio==2.9.0
@@ -26,5 +27,4 @@ basicsr==1.4.2
 xformers==0.0.13
 deepface
 #opencv-contrib-python
---extra-index-url https://download.pytorch.org/whl/cu113
-torch

 torch==1.12.1
 torchvision==0.13.1
 numpy==1.23.1
+git+https://github.com/openai/CLIP.git
 gradio==3.39.0
 albumentations==1.3.0
 imageio==2.9.0
 xformers==0.0.13
 deepface
 #opencv-contrib-python
+--extra-index-url https://download.pytorch.org/whl/cu113