azizinaghsh committed
Commit: 293829f
Parent(s): 8d93e3b

add ccd

fixed path
update path
update path
add CCD to path
update path
fixed ccd log function
fixed checkpoint path
fixed config.yaml
delete app.py

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the complete change set.
- .DS_Store +0 -0
- .gitignore +1 -0
- CCD/.DS_Store +0 -0
- CCD/checkpoints/Mean_Std.npy +3 -0
- CCD/checkpoints/latest.pth +3 -0
- CCD/src/.DS_Store +0 -0
- CCD/src/LSTM.py +202 -0
- CCD/src/README.md +51 -0
- CCD/src/classify.py +368 -0
- CCD/src/main.py +389 -0
- CCD/src/metric.py +178 -0
- CCD/utils/rerun.py +102 -0
- {checkpoints → ET/checkpoints}/ca-mixed-e449.ckpt +0 -0
- {configs → ET/configs}/compnode/cpu.yaml +0 -0
- {configs → ET/configs}/compnode/gpu.yaml +0 -0
- {configs → ET/configs}/config.yaml +2 -2
- {configs → ET/configs}/dataset/caption/caption.yaml +0 -0
- {configs → ET/configs}/dataset/char/char.yaml +0 -0
- {configs → ET/configs}/dataset/standardization/0300.yaml +0 -0
- {configs → ET/configs}/dataset/traj+caption+char.yaml +0 -0
- {configs → ET/configs}/dataset/trajectory/rot6d_trajectory.yaml +0 -0
- {configs → ET/configs}/diffuser/network/module/ca_director.yaml +0 -0
- {configs → ET/configs}/diffuser/network/rn_director.yaml +0 -0
- {configs → ET/configs}/diffuser/rn_director_edm.yaml +0 -0
- {data → ET/data}/cam_segments/2011_F_EuMeT2wBo_00014_00001.npy +0 -0
- {data → ET/data}/cam_segments/2011_KAeAqaA0Llg_00005_00001.npy +0 -0
- {data → ET/data}/cam_segments/2011_MCkKihQrNA4_00014_00000.npy +0 -0
- {data → ET/data}/caption/2011_F_EuMeT2wBo_00014_00001.txt +0 -0
- {data → ET/data}/caption/2011_KAeAqaA0Llg_00005_00001.txt +0 -0
- {data → ET/data}/caption/2011_MCkKihQrNA4_00014_00000.txt +0 -0
- {data → ET/data}/caption_clip/seq/2011_F_EuMeT2wBo_00014_00001.npy +0 -0
- {data → ET/data}/caption_clip/seq/2011_KAeAqaA0Llg_00005_00001.npy +0 -0
- {data → ET/data}/caption_clip/seq/2011_MCkKihQrNA4_00014_00000.npy +0 -0
- {data → ET/data}/caption_clip/token/2011_F_EuMeT2wBo_00014_00001.npy +0 -0
- {data → ET/data}/caption_clip/token/2011_KAeAqaA0Llg_00005_00001.npy +0 -0
- {data → ET/data}/caption_clip/token/2011_MCkKihQrNA4_00014_00000.npy +0 -0
- {data → ET/data}/char/2011_F_EuMeT2wBo_00014_00001.npy +0 -0
- {data → ET/data}/char/2011_KAeAqaA0Llg_00005_00001.npy +0 -0
- {data → ET/data}/char/2011_MCkKihQrNA4_00014_00000.npy +0 -0
- {data → ET/data}/char_raw/2011_F_EuMeT2wBo_00014_00001.npy +0 -0
- {data → ET/data}/char_raw/2011_KAeAqaA0Llg_00005_00001.npy +0 -0
- {data → ET/data}/char_raw/2011_MCkKihQrNA4_00014_00000.npy +0 -0
- {data → ET/data}/char_segments/2011_F_EuMeT2wBo_00014_00001.npy +0 -0
- {data → ET/data}/char_segments/2011_KAeAqaA0Llg_00005_00001.npy +0 -0
- {data → ET/data}/char_segments/2011_MCkKihQrNA4_00014_00000.npy +0 -0
- {data → ET/data}/demo_split.txt +0 -0
- {data → ET/data}/intrinsics/2011_F_EuMeT2wBo_00014_00001.npy +0 -0
- {data → ET/data}/intrinsics/2011_KAeAqaA0Llg_00005_00001.npy +0 -0
- {data → ET/data}/intrinsics/2011_MCkKihQrNA4_00014_00000.npy +0 -0
- {data → ET/data}/traj/2011_F_EuMeT2wBo_00014_00001.txt +0 -0
.DS_Store
ADDED
Binary file (6.15 kB)
.gitignore
ADDED
@@ -0,0 +1 @@
.DS_Store
CCD/.DS_Store
ADDED
Binary file (6.15 kB)
CCD/checkpoints/Mean_Std.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:332dacb3ed6e6862c11b0b5f33a469ad3b715e15b7100219408188726ebb3ce7
size 502
CCD/checkpoints/latest.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bd4c2981c5ba8807797c38c649da0111a0bc9fb4846f9d24fe8ca459a2fefc0a
size 18868379
CCD/src/.DS_Store
ADDED
Binary file (6.15 kB)
CCD/src/LSTM.py
ADDED
@@ -0,0 +1,202 @@
'''
This script does conditional image generation on MNIST, using a diffusion model

This code is modified from,
https://github.com/cloneofsimo/minDiffusion

Diffusion model is based on DDPM,
https://arxiv.org/abs/2006.11239

The conditioning idea is taken from 'Classifier-Free Diffusion Guidance',
https://arxiv.org/abs/2207.12598

This technique also features in ImageGen 'Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding',
https://arxiv.org/abs/2205.11487
'''

from typing import Dict, Tuple
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import models, transforms
from torchvision.datasets import MNIST
from torchvision.utils import save_image, make_grid
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation, PillowWriter
import numpy as np
import os
import clip

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, embed_size=512, n_layer=1, bidirectional=False):
        super(LSTM, self).__init__()
        self.n_layer = n_layer
        self.bidirectional = bidirectional
        self.hidden_size = hidden_size

        self.num_directions = 2 if bidirectional else 1
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers=n_layer, batch_first=True, bidirectional=bidirectional)

        self.encoder = nn.Sequential(nn.Linear(embed_size, hidden_size))

        self.decoder = nn.Sequential(nn.Linear(hidden_size, output_size))

        self.embed = nn.Sequential(nn.Linear(embed_size, embed_size))


    def initHidden(self, batch_size=1):
        h0 = torch.zeros(self.n_layer, batch_size, self.hidden_size, requires_grad=False).cuda()
        c0 = torch.zeros(self.n_layer, batch_size, self.hidden_size, requires_grad=False).cuda()
        return (h0, c0)

    def forward(self, input, embed):
        bs, length, n_feat = input.shape

        embed = self.embed(embed).unsqueeze(1).repeat(1, length, 1)

        hidden = self.initHidden(bs)
        output, hidden = self.lstm(embed, hidden)

        return self.decoder(output)

import torch.utils.data as data
class camdataset(data.Dataset):
    def __init__(self, data, label):
        self.data = data
        self.label = label

    def __getitem__(self, index):
        text = np.random.choice(self.label[index], np.random.randint(1, len(self.label[index])+1), replace=False)

        d = self.data[index]
        d = np.concatenate((d, d[-1:].repeat(300-len(d), 0)), 0)

        return np.array(d, dtype="float32"), " ".join(text)

    def __len__(self):
        return len(self.data)


def train():
    data = np.load("data.npy", allow_pickle=True)[()]

    d = np.concatenate(data["cam"], 0)
    Mean, Std = np.mean(d, 0), np.std(d, 0)

    for i in range(len(data["cam"])):
        data["cam"][i] = (data["cam"][i] - Mean[None, :]) / (Std[None, :] + 1e-8)

    # hardcoding these here
    n_epoch = 1000
    batch_size = 128
    device = "cuda:0"
    n_feature = 5
    lrate = 1e-4
    save_model = True
    save_dir = './result/'
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    criterion = torch.nn.MSELoss()
    trans = LSTM(input_size=n_feature, hidden_size=512, output_size=n_feature)
    trans.to(device)

    optim = torch.optim.Adam(trans.parameters(), lr=lrate)

    dataloader = DataLoader(camdataset(data['cam'], data['info']), batch_size=batch_size, shuffle=True, num_workers=5)

    if not os.path.exists("result"):
        os.mkdir("result")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)

    for ep in range(n_epoch):
        print(f'epoch {ep}')
        trans.train()

        # linear lrate decay
        optim.param_groups[0]['lr'] = lrate * (1 - ep / n_epoch)

        pbar = tqdm(dataloader)
        loss_ema = None
        for x, c in pbar:
            optim.zero_grad()
            x = x.to(device)
            with torch.no_grad():
                c = clip.tokenize(c, truncate=True).to(device)
                c = model.encode_text(c).float().detach()

            loss = criterion(trans(x, c), x)
            loss.backward()
            if loss_ema is None:
                loss_ema = loss.item()
            else:
                loss_ema = 0.95 * loss_ema + 0.05 * loss.item()
            pbar.set_description(f"loss: {loss_ema:.4f}")
            optim.step()

        torch.save(trans.state_dict(), save_dir + f"latest.pth")
        if save_model and ep % 100 == 0:

            torch.save(trans.state_dict(), save_dir + f"model_{ep}.pth")
            print('saved model at ' + save_dir + f"model_{ep}.pth")

def eval():
    if not os.path.exists("Mean_Std.npy"):
        data = np.load("data.npy", allow_pickle=True)[()]

        d = np.concatenate(data["cam"], 0)
        Mean, Std = np.mean(d, 0), np.std(d, 0)
        np.save("Mean_Std", {"Mean": Mean, "Std": Std})
    d = np.load("Mean_Std.npy", allow_pickle=True)[()]
    Mean, Std = d["Mean"], d["Std"]

    device = "cuda:0"
    n_feature = 5

    trans = LSTM(input_size=n_feature, hidden_size=512, output_size=n_feature)
    trans.to(device)

    # optionally load a model
    trans.load_state_dict(torch.load("./result/latest.pth"))

    if not os.path.exists("viz"):
        os.mkdir("viz")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)

    d = np.load("test_prompt.npy", allow_pickle=True)[()]

    result = []
    for i in tqdm(range(0, len(d['info']), 100)):
        txt = d['info'][i:i + 100]
        text = [" ".join(v) for v in txt]

        with torch.no_grad():
            c = clip.tokenize(text, truncate=True).to(device)
            c = model.encode_text(c).float().detach()

            sample = trans(torch.zeros(len(c), 300, n_feature), c)
            sample = sample.detach().cpu().numpy()

        for j in range(len(text)):
            s = sample[j] * Std[None, :] + Mean[None, :]
            result.append(s)

    np.save("LSTM_test", {"result": result, "label": d["label"]})

if __name__ == "__main__":
    import sys
    mode = sys.argv[1]

    if mode == 'train':
        train()
    elif mode == 'eval':
        eval()
    else:
        print('Error, instruction {} is not in {{train, eval}}'.format(mode))
CCD/src/README.md
ADDED
@@ -0,0 +1,51 @@
# Cinematographic Camera Diffusion Model

This repo provides the PyTorch implementation of our paper:

*Cinematographic Camera Diffusion Model*

[Hongda Jiang](https://jianghd1996.github.io/), [Xi Wang](https://triocrossing.github.io/), [Marc Christie](http://people.irisa.fr/Marc.Christie/), [Libin Liu](http://libliu.info/), [Baoquan Chen](https://baoquanchen.info/)

Eurographics 2024

The homepage and paper will be released after publication.

## Prerequisites

The environment requirements for this repo are simple.

- Linux
- NVIDIA GPU + CUDA CuDNN
- Python 3.8
- PyTorch, torchvision, tqdm, matplotlib, numpy, [CLIP](https://github.com/openai/CLIP)

## Dataset

We provide the dataset at [link](https://drive.google.com/file/d/1VxmGy9szWShOKzWvIxrmgaNEkeqGPLJU/view?usp=sharing). The dataset is a numpy dict, where the key 'cam' contains the camera trajectories and 'info' contains the text descriptions.

## Pretrained Model

We provide [weights](https://drive.google.com/file/d/136IZeL4PSf9L6FJ4n_jFM6QFLTDjbvr1/view?usp=sharing) from text-only training. Please create an empty folder `weight` and put the weight file into that folder.

Tips:
If you use the pretrained weights, please write *zooms in* and *zooms out* instead of *pushes in* and *pulls out* when generating such sequences, since those are the prompts used during training.

## Inference

Simply run ```python main.py gen``` and the generated sequences will be written to the folder `gen`.

We provide a Unity scene for visualizing the result ([link](https://drive.google.com/file/d/1zAOJ8zN2hYO-dlQJSNl5uR_JtKapjpM8/view?usp=sharing)); the project uses Unity 2018.2.13f1. You need to set the file path, the shooting target (head), and the shooting character. Here we provide an example of a 'pan' motion with the prompt 'The camera pans to the character. The camera switches from right front view to right back view. The character is at the middle center of the screen. The camera shoots at close shot.'.

![camera_parameter](image/Unity_script.png)

![camera_run](image/unity.gif)

## Evaluation

We provide the code for the classifier `classify.py`, the metrics `metric.py`, and the LSTM-based camera motion generator `LSTM.py`. The training and testing datasets are split randomly with a 9:1 ratio.

## Acknowledgement

This code is standing on the shoulders of giants. We want to thank the following projects that our code is based on:

[Conditional Diffusion MNIST](https://github.com/TeaPearce/Conditional_Diffusion_MNIST), [MDM: Human Motion Diffusion Model](https://github.com/GuyTevet/motion-diffusion-model).
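A minimal sketch of loading the dataset dict described in the README's Dataset section, assuming it has been downloaded as `data.npy` (the filename used by the training scripts in this commit); the printed shapes are illustrative, not guaranteed by the download:

```python
import numpy as np

# Load the dataset dict: 'cam' holds camera trajectories, 'info' holds text descriptions.
data = np.load("data.npy", allow_pickle=True)[()]

print(len(data["cam"]), "clips")
print(data["cam"][0].shape)   # per-clip trajectory, e.g. (num_frames, 5)
print(data["info"][0])        # text description(s) for the first clip
```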
CCD/src/classify.py
ADDED
@@ -0,0 +1,368 @@
'''
This script does conditional image generation on MNIST, using a diffusion model

This code is modified from,
https://github.com/cloneofsimo/minDiffusion

Diffusion model is based on DDPM,
https://arxiv.org/abs/2006.11239

The conditioning idea is taken from 'Classifier-Free Diffusion Guidance',
https://arxiv.org/abs/2207.12598

This technique also features in ImageGen 'Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding',
https://arxiv.org/abs/2205.11487
'''

from typing import Dict, Tuple
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import models, transforms
from torchvision.datasets import MNIST
from torchvision.utils import save_image, make_grid
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation, PillowWriter
import numpy as np
import os
import clip

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # not used in the final model
        x = x + self.pe[:x.shape[0], :]
        return self.dropout(x)

class TimestepEmbedder(nn.Module):
    def __init__(self, latent_dim, sequence_pos_encoder):
        super().__init__()
        self.latent_dim = latent_dim
        self.sequence_pos_encoder = sequence_pos_encoder

        time_embed_dim = self.latent_dim
        self.time_embed = nn.Sequential(
            nn.Linear(self.latent_dim, time_embed_dim),
            nn.SiLU(),
            nn.Linear(time_embed_dim, time_embed_dim),
        )

    def forward(self, timesteps):
        return self.time_embed(self.sequence_pos_encoder.pe[timesteps]).permute(1, 0, 2)

class Transformer(nn.Module):
    def __init__(self, n_feature, n_label, latent_dim=256,
                 num_heads=4, ff_size=1024, dropout=0.1, activation='gelu',
                 num_layers=4, sliding_wind=300):
        super(Transformer, self).__init__()

        self.n_feature = n_feature
        self.n_label = n_label
        self.num_heads = num_heads
        self.ff_size = ff_size
        self.dropout = dropout
        self.activation = activation
        self.num_layers = num_layers
        self.latent_dim = latent_dim

        self.input_process = nn.Linear(self.n_feature, self.latent_dim)

        seqTransEncoderlayer = nn.TransformerEncoderLayer(d_model=self.latent_dim,
                                                          nhead=self.num_heads,
                                                          dim_feedforward=self.ff_size,
                                                          dropout=self.dropout,
                                                          activation=self.activation)

        self.seqTransEncoder = nn.TransformerEncoder(seqTransEncoderlayer,
                                                     num_layers=self.num_layers)

        self.sequence_pos_encoder = PositionalEncoding(self.latent_dim, self.dropout)
        self.embed_timestep = TimestepEmbedder(self.latent_dim, self.sequence_pos_encoder)

        self.output_process = nn.Sequential(
            nn.Linear(self.latent_dim, 1),
            nn.ReLU()
        )
        self.pred = nn.Sequential(
            nn.Linear(sliding_wind, n_label),
            # nn.Softmax(dim=1),
        )


    def forward(self, x):
        bs = len(x)
        x = self.input_process(x.permute(1, 0, 2))

        xseq = self.sequence_pos_encoder(x)
        xseq = self.seqTransEncoder(xseq)
        xseq = self.output_process(xseq).permute(1, 0, 2)

        xseq = xseq.view(bs, -1)

        return self.pred(xseq)

    def forward_feature(self, x):
        bs = len(x)
        x = self.input_process(x.permute(1, 0, 2))

        xseq = self.sequence_pos_encoder(x)
        xseq = self.seqTransEncoder(xseq)
        xseq = self.output_process(xseq).permute(1, 0, 2)

        return xseq.view(bs, -1)

import torch.utils.data as data
class camdataset(data.Dataset):
    def __init__(self, cam, label):
        self.cam = cam
        self.label = label

    def __getitem__(self, index):
        d = self.cam[index]
        data = np.concatenate((d, d[-1:].repeat(300-len(d), 0)), 0)
        return np.array(data, dtype="float32"), self.label[index]

    def __len__(self):
        return len(self.cam)


def train_mnist():
    data = np.load("data.npy", allow_pickle=True)[()]

    d = np.concatenate(data["train_cam"]+data["test_cam"], 0)
    Mean, Std = np.mean(d, 0), np.std(d, 0)

    np.save("Mean_Std", {"Mean": Mean, "Std": Std})

    for i in range(len(data["train_cam"])):
        data["train_cam"][i] = (data["train_cam"][i] - Mean[None, :]) / (Std[None, :]+1e-8)

    for i in range(len(data["test_cam"])):
        data["test_cam"][i] = (data["test_cam"][i] - Mean[None, :]) / (Std[None, :]+1e-8)

    # hardcoding these here
    n_epoch = 1000
    batch_size = 128
    device = "cuda:0"
    n_feature = 5
    n_label = 6
    lrate = 1e-4
    save_model = True
    save_dir = './result/'
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    criterion = torch.nn.CrossEntropyLoss()
    trans = Transformer(n_feature=n_feature, n_label=n_label)
    trans.to(device)

    optim = torch.optim.Adam(trans.parameters(), lr=lrate)

    dataloader = DataLoader(camdataset(data['train_cam'], data['train_label']), batch_size=batch_size, shuffle=True, num_workers=5)
    testloader = DataLoader(camdataset(data['test_cam'], data['test_label']), batch_size=batch_size, shuffle=False, num_workers=5)

    if not os.path.exists("result"):
        os.mkdir("result")

    for ep in range(n_epoch):
        print(f'epoch {ep}')

        # linear lrate decay
        optim.param_groups[0]['lr'] = lrate*(1-ep/n_epoch)

        pbar = tqdm(dataloader)

        trans.train()
        correct = 0
        total = 0
        for cam, label in pbar:
            cam = cam.to(device)
            label = label.to(device)

            pred_v = trans(cam)

            predictions = torch.argmax(pred_v, dim=1)
            correct += torch.sum(predictions == label).item()
            total += len(predictions)

            optim.zero_grad()
            loss = criterion(pred_v, label)
            loss.backward()

            pbar.set_description(f"training acc: {100.0 * correct/total:.4f}")
            optim.step()

        trans.eval()
        correct = 0
        total = 0
        for cam, label in testloader:
            cam = cam.to(device)
            label = label.to(device)

            pred_v = trans(cam)
            predictions = torch.argmax(pred_v, dim=1)

            correct += torch.sum(predictions == label)
            total += len(predictions)
        print("evaluation accuracy : {}".format(1.0 * correct / total))

        torch.save(trans.state_dict(), save_dir + f"latest.pth")
        if save_model and ep % 100 == 0:
            torch.save(trans.state_dict(), save_dir + f"model_{ep}.pth")
            print('saved model at ' + save_dir + f"model_{ep}.pth")

def eval_mnist(file_name):
    if not os.path.exists("Mean_Std.npy"):
        data = np.load("data.npy", allow_pickle=True)[()]

        d = np.concatenate(data["train_cam"] + data["test_cam"], 0)
        Mean, Std = np.mean(d, 0), np.std(d, 0)
        np.save("Mean_Std", {"Mean": Mean, "Std": Std})

    d = np.load("Mean_Std.npy", allow_pickle=True)[()]
    Mean, Std = d["Mean"], d["Std"]

    data = np.load(file_name+".npy", allow_pickle=True)[()]

    for i in range(len(data["result"])):
        data["result"][i] = (data["result"][i] - Mean[None, :]) / (Std[None, :]+1e-8)

    device = "cuda:0"
    n_feature = 5
    n_label = 6

    trans = Transformer(n_feature=n_feature, n_label=n_label)
    trans.to(device)

    # optionally load a model
    trans.load_state_dict(torch.load("./result/latest.pth"))

    testloader = DataLoader(camdataset(data['result'], data['label']), batch_size=8, num_workers=5)

    correct = 0
    total = 0
    t = [0] * 10
    f = [0] * 10
    trans.eval()
    with torch.no_grad():
        for cam, label in tqdm(testloader):
            cam = cam.to(device)
            label = label.to(device)

            pred_v = trans(cam)
            predictions = torch.argmax(pred_v, dim=1)

            correct += torch.sum(predictions == label)
            total += len(predictions)

            for i in range(len(predictions)):
                if predictions[i] == label[i]:
                    t[label[i]] += 1
                else:
                    f[label[i]] += 1

    print("gen accuracy : {}/{}={} ".format(correct, total, 1.0 * correct / total))
    for i in range(n_label):
        print("{} {} {}".format(i, t[i], t[i]+f[i]))

def process_feature(file_list):
    data = np.load("data.npy", allow_pickle=True)[()]

    d = np.concatenate(data["train_cam"] + data["test_cam"], 0)
    Mean, Std = np.mean(d, 0), np.std(d, 0)

    for i in range(len(data["train_cam"])):
        data["train_cam"][i] = (data["train_cam"][i] - Mean[None, :]) / (Std[None, :]+1e-8)

    for i in range(len(data["test_cam"])):
        data["test_cam"][i] = (data["test_cam"][i] - Mean[None, :]) / (Std[None, :]+1e-8)

    device = "cuda:0"
    n_feature = 5
    n_label = 6

    trans = Transformer(n_feature=n_feature, n_label=n_label)
    trans.to(device)

    # optionally load a model
    trans.load_state_dict(torch.load("./result/latest.pth"))

    trans.eval()

    d = dict()

    testloader = DataLoader(camdataset(data['train_cam'], data['train_label']), batch_size=8, num_workers=5)

    feature = []

    with torch.no_grad():
        for cam, label in tqdm(testloader):
            cam = cam.to(device)

            pred_v = trans.forward_feature(cam).detach().cpu().numpy()

            for v in pred_v:
                feature.append(v)

    d["train_data"] = feature

    testloader = DataLoader(camdataset(data['test_cam'], data['test_label']), batch_size=8, num_workers=5)

    feature = []

    with torch.no_grad():
        for cam, label in tqdm(testloader):
            cam = cam.to(device)

            pred_v = trans.forward_feature(cam).detach().cpu().numpy()

            for v in pred_v:
                feature.append(v)

    d["test_data"] = feature


    for file in file_list:
        data = np.load(file+".npy", allow_pickle=True)[()]

        for i in range(len(data["result"])):
            data["result"][i] = (data["result"][i] - Mean[None, :]) / (Std[None, :] + 1e-8)

        testloader = DataLoader(camdataset(data['result'], data['label']), batch_size=8, num_workers=5)

        feature = []

        with torch.no_grad():
            for cam, label in tqdm(testloader):
                cam = cam.to(device)

                pred_v = trans.forward_feature(cam).detach().cpu().numpy()

                for v in pred_v:
                    feature.append(v)

        d[file] = feature

    np.save("feature", d)


if __name__ == "__main__":
    train_mnist()
    #
    # eval_mnist()

    # process_feature()
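The classifier above reads `data.npy` with the split arrays under the keys `train_cam`, `train_label`, `test_cam`, and `test_label`. A small sketch of a synthetic dict with that layout (the shapes and the six label classes come from the script; the random content is only illustrative):

```python
import numpy as np

rng = np.random.default_rng(0)

def fake_clip():
    length = int(rng.integers(60, 300))     # clips are padded to 300 frames by camdataset
    return rng.normal(size=(length, 5))     # 5 camera parameters per frame (n_feature)

data = {
    "train_cam": [fake_clip() for _ in range(90)],
    "train_label": [int(rng.integers(0, 6)) for _ in range(90)],  # 6 motion classes (n_label)
    "test_cam": [fake_clip() for _ in range(10)],
    "test_label": [int(rng.integers(0, 6)) for _ in range(10)],
}
np.save("data", data)  # train_mnist() reloads this file as data.npy
```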
CCD/src/main.py
ADDED
@@ -0,0 +1,389 @@
'''
This script does conditional image generation on MNIST, using a diffusion model

This code is modified from,
https://github.com/cloneofsimo/minDiffusion

Diffusion model is based on DDPM,
https://arxiv.org/abs/2006.11239

The conditioning idea is taken from 'Classifier-Free Diffusion Guidance',
https://arxiv.org/abs/2207.12598

This technique also features in ImageGen 'Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding',
https://arxiv.org/abs/2205.11487
'''
import random
from typing import Dict, Tuple
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import models, transforms
from torchvision.datasets import MNIST
from torchvision.utils import save_image, make_grid
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation, PillowWriter
import numpy as np
import os
import clip

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # not used in the final model
        x = x + self.pe[:x.shape[0], :]
        return self.dropout(x)

class TimestepEmbedder(nn.Module):
    def __init__(self, latent_dim, sequence_pos_encoder):
        super().__init__()
        self.latent_dim = latent_dim
        self.sequence_pos_encoder = sequence_pos_encoder

        time_embed_dim = self.latent_dim
        self.time_embed = nn.Sequential(
            nn.Linear(self.latent_dim, time_embed_dim),
            nn.SiLU(),
            nn.Linear(time_embed_dim, time_embed_dim),
        )

    def forward(self, timesteps):
        return self.time_embed(self.sequence_pos_encoder.pe[timesteps]).permute(1, 0, 2)

class Transformer(nn.Module):
    def __init__(self, n_feature, n_textemb, latent_dim=256,
                 num_heads=4, ff_size=1024, dropout=0.1, activation='gelu',
                 num_layers=4, cond_mask_prob=0.1):
        super(Transformer, self).__init__()

        self.n_feature = n_feature
        self.n_textemb = n_textemb
        self.num_heads = num_heads
        self.ff_size = ff_size
        self.dropout = dropout
        self.activation = activation
        self.num_layers = num_layers
        self.latent_dim = latent_dim
        self.cond_mask_prob = cond_mask_prob

        self.embed_text = nn.Linear(self.n_textemb, self.latent_dim)

        self.input_process = nn.Linear(self.n_feature, self.latent_dim)

        seqTransEncoderlayer = nn.TransformerEncoderLayer(d_model=self.latent_dim,
                                                          nhead=self.num_heads,
                                                          dim_feedforward=self.ff_size,
                                                          dropout=self.dropout,
                                                          activation=self.activation)

        self.seqTransEncoder = nn.TransformerEncoder(seqTransEncoderlayer,
                                                     num_layers=self.num_layers)

        self.sequence_pos_encoder = PositionalEncoding(self.latent_dim, self.dropout)
        self.embed_timestep = TimestepEmbedder(self.latent_dim, self.sequence_pos_encoder)

        self.output_process = nn.Linear(self.latent_dim, self.n_feature)

    def mask_cond(self, cond, force_mask=False):
        bs, d = cond.shape
        if force_mask:
            return torch.zeros_like(cond)
        elif self.training and self.cond_mask_prob > 0.:
            mask = torch.bernoulli(torch.ones(bs, device=cond.device) * self.cond_mask_prob).view(bs, 1)  # 1-> use null_cond, 0-> use real cond
            return cond * (1. - mask)
        else:
            return cond

    def forward(self, x, emb_text, timesteps, force_mask=False):
        emb_time = self.embed_timestep(timesteps)

        emb_text = self.embed_text(self.mask_cond(emb_text, force_mask=force_mask))
        emb = (emb_time + emb_text)

        x = self.input_process(x.permute(1, 0, 2))

        xseq = torch.cat((emb, x), axis=0)
        xseq = self.sequence_pos_encoder(xseq)
        output = self.seqTransEncoder(xseq)[1:]

        return self.output_process(output).permute(1, 0, 2)



def ddpm_schedules(beta1, beta2, T):
    """
    Returns pre-computed schedules for DDPM sampling, training process.
    """
    assert beta1 < beta2 < 1.0, "beta1 and beta2 must be in (0, 1)"

    beta_t = (beta2 - beta1) * torch.arange(0, T + 1, dtype=torch.float32) / T + beta1
    sqrt_beta_t = torch.sqrt(beta_t)
    alpha_t = 1 - beta_t
    log_alpha_t = torch.log(alpha_t)
    alphabar_t = torch.cumsum(log_alpha_t, dim=0).exp()

    sqrtab = torch.sqrt(alphabar_t)
    oneover_sqrta = 1 / torch.sqrt(alpha_t)

    sqrtmab = torch.sqrt(1 - alphabar_t)
    mab_over_sqrtmab_inv = (1 - alpha_t) / sqrtmab

    return {
        "alpha_t": alpha_t,  # \alpha_t
        "oneover_sqrta": oneover_sqrta,  # 1/\sqrt{\alpha_t}
        "sqrt_beta_t": sqrt_beta_t,  # \sqrt{\beta_t}
        "alphabar_t": alphabar_t,  # \bar{\alpha_t}
        "sqrtab": sqrtab,  # \sqrt{\bar{\alpha_t}}
        "sqrtmab": sqrtmab,  # \sqrt{1-\bar{\alpha_t}}
        "mab_over_sqrtmab": mab_over_sqrtmab_inv,  # (1-\alpha_t)/\sqrt{1-\bar{\alpha_t}}
    }


class DDPM(nn.Module):
    def __init__(self, nn_model, betas, n_T, device):
        super(DDPM, self).__init__()
        self.nn_model = nn_model.to(device)

        # register_buffer allows accessing dictionary produced by ddpm_schedules
        # e.g. can access self.sqrtab later
        for k, v in ddpm_schedules(betas[0], betas[1], n_T).items():
            self.register_buffer(k, v)

        self.n_T = n_T
        self.device = device
        self.loss_mse = nn.MSELoss()

        self.count = [0] * n_T

    def forward(self, x, c):
        """
        this method is used in training, so samples t and noise randomly
        """

        _ts = torch.randint(1, self.n_T, (x.shape[0],)).to(self.device)  # t ~ Uniform(0, n_T)
        noise = torch.randn_like(x)  # eps ~ N(0, 1)

        for t in _ts:
            self.count[t] += 1

        x_t = (
            self.sqrtab[_ts, None, None] * x
            + self.sqrtmab[_ts, None, None] * noise
        )  # This is the x_t, which is sqrt(alphabar) x_0 + sqrt(1-alphabar) * eps
        # We should predict the "error term" from this x_t. Loss is what we return.

        # return MSE between added noise, and our predicted noise
        return self.loss_mse(noise, self.nn_model(x_t, c, _ts))

    def sample(self, n_sample, c, size, device, guide_w):
        # we follow the guidance sampling scheme described in 'Classifier-Free Diffusion Guidance'
        # to make the fwd passes efficient, we concat two versions of the dataset,
        # one with context_mask=0 and the other context_mask=1
        # we then mix the outputs with the guidance scale, w
        # where w>0 means more guidance

        x_i = torch.randn(n_sample, *size).to(device)  # x_T ~ N(0, 1), sample initial noise

        if c.shape[0] == 1:
            c_i = c.repeat(n_sample, 1).float()
        else:
            c_i = c.float()

        for i in tqdm(range(self.n_T, 0, -1)):
            t_is = torch.tensor(i).to(device).repeat(n_sample)

            # split predictions and compute weighting
            eps1 = self.nn_model(x_i, c_i, t_is)
            eps2 = self.nn_model(x_i, c_i, t_is, force_mask=True)
            eps = eps2 + guide_w * (eps1 - eps2)

            z = torch.randn(n_sample, *size).to(device) if i > 1 else 0


            x_i = (
                self.oneover_sqrta[i] * (x_i - eps * self.mab_over_sqrtmab[i])
                + self.sqrt_beta_t[i] * z
            )

        return x_i



import torch.utils.data as data
class camdataset(data.Dataset):
    def __init__(self, data, label):
        self.data = data
        self.label = label

    def __getitem__(self, index):
        text = np.random.choice(self.label[index], np.random.randint(1, len(self.label[index])+1), replace=False)

        d = self.data[index]
        d = np.concatenate((d, d[-1:].repeat(300-len(d), 0)), 0)

        return np.array(d, dtype="float32"), " ".join(text)

    def __len__(self):
        return len(self.data)


def train():
    data = np.load("data.npy", allow_pickle=True)[()]

    d = np.concatenate(data["cam"], 0)
    Mean, Std = np.mean(d, 0), np.std(d, 0)

    for i in range(len(data["cam"])):
        data["cam"][i] = (data["cam"][i] - Mean[None, :]) / (Std[None, :]+1e-8)

    # hardcoding these here
    n_epoch = 20000
    batch_size = 256
    n_T = 1000  # 500
    device = "cuda:0"
    n_feature = 5
    n_textemb = 512
    lrate = 1e-4
    save_model = True
    save_dir = './weight/'
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    ddpm = DDPM(nn_model=Transformer(n_feature=n_feature, n_textemb=n_textemb), betas=(1e-4, 0.02), n_T=n_T, device=device)
    ddpm.to(device)

    optim = torch.optim.Adam(ddpm.parameters(), lr=lrate)

    dataloader = DataLoader(camdataset(data['cam'], data['info']), batch_size=batch_size, shuffle=True, num_workers=5)

    if not os.path.exists("result"):
        os.mkdir("result")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)

    for ep in range(n_epoch):
        print(f'epoch {ep}')
        ddpm.train()

        # linear lrate decay
        optim.param_groups[0]['lr'] = lrate*(1-ep/n_epoch)

        pbar = tqdm(dataloader)
        loss_ema = None
        for x, c in pbar:
            optim.zero_grad()
            x = x.to(device)
            with torch.no_grad():
                c = clip.tokenize(c, truncate=True).to(device)
                c = model.encode_text(c).detach()

            loss = ddpm(x, c)
            loss.backward()
            if loss_ema is None:
                loss_ema = loss.item()
            else:
                loss_ema = 0.95 * loss_ema + 0.05 * loss.item()
            pbar.set_description(f"loss: {loss_ema:.4f}")
            optim.step()

        torch.save(ddpm.state_dict(), save_dir + f"latest.pth")
        if save_model and ep % 100 == 0:
            torch.save(ddpm.state_dict(), save_dir + f"model_{ep}.pth")
            print('saved model at ' + save_dir + f"model_{ep}.pth")


def gen(text: str):
    script_dir = os.path.dirname(os.path.abspath(__file__))

    mean_std_path = os.path.join(script_dir, "..", "checkpoints", "Mean_Std.npy")

    latest_path = os.path.join(script_dir, "..", "checkpoints", "latest.pth")

    if not os.path.exists(mean_std_path):
        data = np.load("data.npy", allow_pickle=True)[()]

        d = np.concatenate(data["cam"], 0)
        Mean, Std = np.mean(d, 0), np.std(d, 0)
        np.save("Mean_Std", {"Mean": Mean, "Std": Std})

    d = np.load(mean_std_path, allow_pickle=True)[()]
    Mean, Std = d["Mean"], d["Std"]

    n_T = 1000  # 500
    device = "cuda:0"
    n_feature = 5
    n_textemb = 512

    ddpm = DDPM(nn_model=Transformer(n_feature=n_feature, n_textemb=n_textemb), betas=(1e-4, 0.02), n_T=n_T,
                device=device)
    ddpm.to(device)

    # optionally load a model
    ddpm.load_state_dict(torch.load(latest_path))

    if not os.path.exists("gen"):
        os.mkdir("gen")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)

    # text = ["The camera pans to the character. The camera switches from right front view to right back view. The character is at the middle center of the screen. The camera shoots at close shot."]

    result = []

    def smooth(x, winds=10, T=4):
        if T == 0:
            return x
        n_x = np.array(x)
        for i in range(len(x)):
            n_x[i] = np.mean(x[max(0, i - winds):min(len(x), i + winds), :], 0)
        return smooth(n_x, T=T - 1)

    with torch.no_grad():
        c = clip.tokenize(text, truncate=True).to(device)
        c = model.encode_text(c)

        sample = ddpm.sample(10, c, (300, n_feature), device, guide_w=2.0)
        sample = sample.detach().cpu().numpy()

        for j in range(len(sample)):
            s = smooth(sample[j] * Std[None, :] + Mean[None, :])
            result.append(s)
    return result
    # with open("gen/{}.txt".format(j), "w") as f:
    #     for i in range(len(s)):
    #         txt = ""
    #         for k in range(5):
    #             txt += str(s[i][k]) + " "
    #         f.write(txt+"\n")


def generate_CCD_sample(text: str):
    return gen(text)

if __name__ == "__main__":
    import sys
    mode = sys.argv[1]

    if mode == 'train':
        train()
    elif mode == 'gen':
        # gen() now requires the text prompt; read it from the remaining CLI arguments
        gen(" ".join(sys.argv[2:]))
    else:
        print('Error, instruction {} is not in {{train, gen}}'.format(mode))
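A minimal usage sketch for the `generate_CCD_sample` helper added above, assuming the CCD checkpoints from this commit are on disk and a CUDA device is available; the prompt string is only an example:

```python
# Hypothetical driver script placed next to CCD/src/main.py.
from main import generate_CCD_sample

prompt = "The camera zooms in on the character."   # example prompt, not taken from the commit
trajectories = generate_CCD_sample(prompt)         # list of denormalized (300, 5) trajectories

for i, traj in enumerate(trajectories):
    print(i, traj.shape)
```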
CCD/src/metric.py
ADDED
@@ -0,0 +1,178 @@
import numpy as np
import torch
import matplotlib.pyplot as plt
from scipy import linalg
import os
from tqdm import tqdm

def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
    """Numpy implementation of the Frechet Distance.
    The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1)
    and X_2 ~ N(mu_2, C_2) is
    d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)).
    Stable version by Dougal J. Sutherland.
    Params:
    -- mu1 : Numpy array containing the activations of a layer of the
             inception net (like returned by the function 'get_predictions')
             for generated samples.
    -- mu2 : The sample mean over activations, precalculated on an
             representative data set.
    -- sigma1: The covariance matrix over activations for generated samples.
    -- sigma2: The covariance matrix over activations, precalculated on an
               representative data set.
    Returns:
    -- : The Frechet Distance.
    """

    mu1 = np.atleast_1d(mu1)
    mu2 = np.atleast_1d(mu2)

    sigma1 = np.atleast_2d(sigma1)
    sigma2 = np.atleast_2d(sigma2)

    assert mu1.shape == mu2.shape, \
        'Training and test mean vectors have different lengths'
    assert sigma1.shape == sigma2.shape, \
        'Training and test covariances have different dimensions'

    diff = mu1 - mu2

    # Product might be almost singular
    covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
    if not np.isfinite(covmean).all():
        msg = ('fid calculation produces singular product; '
               'adding %s to diagonal of cov estimates') % eps
        print(msg)
        offset = np.eye(sigma1.shape[0]) * eps
        covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))

    # Numerical error might give slight imaginary component
    if np.iscomplexobj(covmean):
        if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
            m = np.max(np.abs(covmean.imag))
            raise ValueError('Imaginary component {}'.format(m))
        covmean = covmean.real

    tr_covmean = np.trace(covmean)

    return (diff.dot(diff) + np.trace(sigma1)
            + np.trace(sigma2) - 2 * tr_covmean)

def calculate_activation_statistics(data):
    """Calculation of the statistics used by the FID.
    Params:
    -- files : List of image files paths
    -- model : Instance of inception model
    -- batch_size : The images numpy array is split into batches with
                    batch size batch_size. A reasonable batch size
                    depends on the hardware.
    -- dims : Dimensionality of features returned by Inception
    -- device : Device to run calculations
    -- num_workers : Number of parallel dataloader workers
    Returns:
    -- mu : The mean over samples of the activations of the pool_3 layer of
            the inception model.
    -- sigma : The covariance matrix of the activations of the pool_3 layer of
               the inception model.
    """
    mu = np.mean(data, axis=0)
    sigma = np.cov(data, rowvar=False)
    return mu, sigma

def calculate_diversity(data, first_indices, second_indices):
    diversity = 0

    d = torch.FloatTensor(data)

    for first_idx, second_idx in zip(first_indices, second_indices):
        diversity += torch.dist(d[first_idx, :], d[second_idx, :])

    diversity /= len(first_indices)
    return diversity

d = np.load("feature.npy", allow_pickle=True)[()]

d0 = d["train_data"]
d1 = d["test_data"]
d2 = d["gen_T5"]
d3 = d["gen_GRU_T5"]
d4 = d["LSTM_Des"]
d5 = d["gen"]

Mean, Std = np.mean(d0, 0), np.std(d0, 0)
d0 = [(v - Mean[None, :]) / Std[None, :] for v in d0]
d1 = [(v - Mean[None, :]) / Std[None, :] for v in d1]
d2 = [(v - Mean[None, :]) / Std[None, :] for v in d2]
d3 = [(v - Mean[None, :]) / Std[None, :] for v in d3]
d4 = [(v - Mean[None, :]) / Std[None, :] for v in d4]
d5 = [(v - Mean[None, :]) / Std[None, :] for v in d5]

if not os.path.exists("viz"):
    os.mkdir("viz")


d0 = np.array([v.flatten() for v in d0])
d1 = np.array([v.flatten() for v in d1])
d2 = np.array([v.flatten() for v in d2])
d3 = np.array([v.flatten() for v in d3])
d4 = np.array([v.flatten() for v in d4])
d5 = np.array([v.flatten() for v in d5])

print("Diversity")

diversity_times = 10000
num_motions = len(d1)
first_indices = np.random.randint(0, num_motions, diversity_times)
second_indices = np.random.randint(0, num_motions, diversity_times)

print(calculate_diversity(d1, first_indices, second_indices))
print(calculate_diversity(d2, first_indices, second_indices))
print(calculate_diversity(d3, first_indices, second_indices))
print(calculate_diversity(d4, first_indices, second_indices))
print(calculate_diversity(d5, first_indices, second_indices))

print("Diversity with action label")

d = np.load("data.npy", allow_pickle=True)[()]

label = dict()
for i in range(6):
    label[i] = []
for i in range(len(d['test_label'])):
    label[d['test_label'][i]].append(i)

diversity_times = 1000
first_indices = []
second_indices = []
for i in range(6):
    idx = np.random.randint(0, len(label[i]), diversity_times)
    for j in idx:
        first_indices.append(label[i][j])
    idx = np.random.randint(0, len(label[i]), diversity_times)
    for j in idx:
        second_indices.append(label[i][j])

import random
random.shuffle(second_indices)  # shuffle in place (random.shuffle returns None, so there is nothing to print)

print(calculate_diversity(d1, first_indices, second_indices))
print(calculate_diversity(d2, first_indices, second_indices))
print(calculate_diversity(d3, first_indices, second_indices))
print(calculate_diversity(d4, first_indices, second_indices))
print(calculate_diversity(d5, first_indices, second_indices))


print("FID with training")

mu0, sigma0 = calculate_activation_statistics(d0)
mu1, sigma1 = calculate_activation_statistics(d1)
mu2, sigma2 = calculate_activation_statistics(d2)
mu3, sigma3 = calculate_activation_statistics(d3)
mu4, sigma4 = calculate_activation_statistics(d4)
mu5, sigma5 = calculate_activation_statistics(d5)

print(calculate_frechet_distance(mu0, sigma0, mu1, sigma1))
print(calculate_frechet_distance(mu0, sigma0, mu2, sigma2))
print(calculate_frechet_distance(mu0, sigma0, mu3, sigma3))
print(calculate_frechet_distance(mu0, sigma0, mu4, sigma4))
print(calculate_frechet_distance(mu0, sigma0, mu5, sigma5))
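Because `metric.py` runs its whole evaluation at import time against `feature.npy`, here is a self-contained sketch of the same FID computation on synthetic features; the random matrices only stand in for the classifier features the script expects:

```python
import numpy as np
from scipy import linalg

# Synthetic stand-ins for the flattened classifier features used in metric.py.
real_feats = np.random.randn(500, 300)
gen_feats = np.random.randn(500, 300) + 0.1

mu_r, sigma_r = np.mean(real_feats, 0), np.cov(real_feats, rowvar=False)
mu_g, sigma_g = np.mean(gen_feats, 0), np.cov(gen_feats, rowvar=False)

# Same quantity metric.py reports as "FID with training":
# ||mu_r - mu_g||^2 + Tr(Sigma_r + Sigma_g - 2 (Sigma_r Sigma_g)^{1/2})
covmean = linalg.sqrtm(sigma_r.dot(sigma_g)).real
fid = np.sum((mu_r - mu_g) ** 2) + np.trace(sigma_r + sigma_g - 2 * covmean)
print(fid)
```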
CCD/utils/rerun.py
ADDED
@@ -0,0 +1,102 @@
import numpy as np
from matplotlib import colormaps
import rerun as rr
from rerun.components import Material
from scipy.spatial import transform


def color_fn(x, cmap="tab10"):
    return colormaps[cmap](x % colormaps[cmap].N)


def ccd_log_sample(
    root_name: str,
    traj: np.ndarray,
):

    traj = traj[0]
    num_cameras = traj.shape[0]

    rr.log(root_name, rr.ViewCoordinates.RIGHT_HAND_Y_DOWN, timeless=True)

    rr.log(
        f"{root_name}/trajectory/points",
        rr.Points3D(traj[:, :3]),
        timeless=True,
    )

    rr.log(
        f"{root_name}/trajectory/line",
        rr.LineStrips3D(
            np.stack((traj[:, :3][:-1], traj[:, :3][1:]), axis=1),
            colors=[(1.0, 0.0, 1.0, 1.0)],  # Purple color
        ),
        timeless=True,
    )


    for k in range(num_cameras):
        rr.set_time_sequence("frame_idx", k)

        translation = traj[k][:3]

        fx = 955.02  # Focal length in X
        fy = 955.02  # Focal length in Y (same as fx for 1:1 aspect ratio)
        cx = 256  # Principal point X (center of 512x512 image)
        cy = 256  # Principal point Y (center of 512x512 image)
        K = np.array([
            [fx, 0, cx],
            [0, fy, cy],
            [0, 0, 1]
        ])

        rr.log(
            f"{root_name}/camera/image",
            rr.Pinhole(
                image_from_camera=K,
                width=K[0, -1] * 2,
                height=K[1, -1] * 2,
            ),
        )

        width = K[0, -1] * 2
        height = K[1, -1] * 2
        fov_x = 2 * np.arctan(width / (2 * K[0, 0]))
        fov_y = 2 * np.arctan(height / (2 * K[1, 1]))

        horizontal_angle = np.arctan(traj[k][3] * np.tan(fov_x / 2))
        vertical_angle = np.arctan(traj[k][4] * np.tan(fov_y / 2))

        direction = -translation
        direction /= np.linalg.norm(direction)

        up = np.array([0, 1, 0])

        right = np.cross(up, direction)
        right /= np.linalg.norm(right)
        up = np.cross(direction, right)

        rotation_matrix = np.vstack([right, up, direction]).T

        rotation_x = transform.Rotation.from_rotvec(vertical_angle * np.array([1, 0, 0]))
        rotation_y = transform.Rotation.from_rotvec(-horizontal_angle * np.array([0, 1, 0]))

        rotation_combined = rotation_y * transform.Rotation.from_matrix(rotation_matrix) * rotation_x

        rotation_q = rotation_combined.as_quat()

        rr.log(
            f"{root_name}/camera",
            rr.Transform3D(
                translation=translation,
                rotation=rr.Quaternion(xyzw=rotation_q),
            ),
        )

    rr.set_time_sequence("image", k)

    rr.log(
        f"{root_name}/char_traj/points",
        rr.Points3D([[0, 0, 0]], colors=[(1.0, 0.0, 0.0, 1.0)]),
        timeless=True,
    )
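A small sketch of driving `ccd_log_sample` above with the Rerun viewer; the synthetic trajectory, its interpretation (x, y, z plus two screen-space offsets), and the import path are assumptions based on this commit's layout:

```python
import numpy as np
import rerun as rr

from CCD.utils.rerun import ccd_log_sample  # import path assumed from the repository layout

# Synthetic (1, 300, 5) trajectory batch.
traj = np.zeros((1, 300, 5), dtype=np.float32)
traj[0, :, 0] = np.linspace(-2.0, 2.0, 300)   # sweep along x
traj[0, :, 2] = 3.0                           # keep some distance from the origin target

rr.init("ccd_demo", spawn=True)               # start and attach the Rerun viewer
ccd_log_sample("sample_0", traj)
```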
{checkpoints → ET/checkpoints}/ca-mixed-e449.ckpt    RENAMED    File without changes
{configs → ET/configs}/compnode/cpu.yaml    RENAMED    File without changes
{configs → ET/configs}/compnode/gpu.yaml    RENAMED    File without changes
{configs → ET/configs}/config.yaml
RENAMED
@@ -8,9 +8,9 @@ dataset:
   char:
     load_vertices: true
 
-checkpoint_path: 'checkpoints/ca-mixed-e449.ckpt'
+checkpoint_path: 'ET/checkpoints/ca-mixed-e449.ckpt'
 batch_size: 128
-data_dir: data
+data_dir: ET/data
 
 hydra:
   run:
{configs → ET/configs}/dataset/caption/caption.yaml    RENAMED    File without changes
{configs → ET/configs}/dataset/char/char.yaml    RENAMED    File without changes
{configs → ET/configs}/dataset/standardization/0300.yaml    RENAMED    File without changes
{configs → ET/configs}/dataset/traj+caption+char.yaml    RENAMED    File without changes
{configs → ET/configs}/dataset/trajectory/rot6d_trajectory.yaml    RENAMED    File without changes
{configs → ET/configs}/diffuser/network/module/ca_director.yaml    RENAMED    File without changes
{configs → ET/configs}/diffuser/network/rn_director.yaml    RENAMED    File without changes
{configs → ET/configs}/diffuser/rn_director_edm.yaml    RENAMED    File without changes
{data → ET/data}/cam_segments/2011_F_EuMeT2wBo_00014_00001.npy    RENAMED    File without changes
{data → ET/data}/cam_segments/2011_KAeAqaA0Llg_00005_00001.npy    RENAMED    File without changes
{data → ET/data}/cam_segments/2011_MCkKihQrNA4_00014_00000.npy    RENAMED    File without changes
{data → ET/data}/caption/2011_F_EuMeT2wBo_00014_00001.txt    RENAMED    File without changes
{data → ET/data}/caption/2011_KAeAqaA0Llg_00005_00001.txt    RENAMED    File without changes
{data → ET/data}/caption/2011_MCkKihQrNA4_00014_00000.txt    RENAMED    File without changes
{data → ET/data}/caption_clip/seq/2011_F_EuMeT2wBo_00014_00001.npy    RENAMED    File without changes
{data → ET/data}/caption_clip/seq/2011_KAeAqaA0Llg_00005_00001.npy    RENAMED    File without changes
{data → ET/data}/caption_clip/seq/2011_MCkKihQrNA4_00014_00000.npy    RENAMED    File without changes
{data → ET/data}/caption_clip/token/2011_F_EuMeT2wBo_00014_00001.npy    RENAMED    File without changes
{data → ET/data}/caption_clip/token/2011_KAeAqaA0Llg_00005_00001.npy    RENAMED    File without changes
{data → ET/data}/caption_clip/token/2011_MCkKihQrNA4_00014_00000.npy    RENAMED    File without changes
{data → ET/data}/char/2011_F_EuMeT2wBo_00014_00001.npy    RENAMED    File without changes
{data → ET/data}/char/2011_KAeAqaA0Llg_00005_00001.npy    RENAMED    File without changes
{data → ET/data}/char/2011_MCkKihQrNA4_00014_00000.npy    RENAMED    File without changes
{data → ET/data}/char_raw/2011_F_EuMeT2wBo_00014_00001.npy    RENAMED    File without changes
{data → ET/data}/char_raw/2011_KAeAqaA0Llg_00005_00001.npy    RENAMED    File without changes
{data → ET/data}/char_raw/2011_MCkKihQrNA4_00014_00000.npy    RENAMED    File without changes
{data → ET/data}/char_segments/2011_F_EuMeT2wBo_00014_00001.npy    RENAMED    File without changes
{data → ET/data}/char_segments/2011_KAeAqaA0Llg_00005_00001.npy    RENAMED    File without changes
{data → ET/data}/char_segments/2011_MCkKihQrNA4_00014_00000.npy    RENAMED    File without changes
{data → ET/data}/demo_split.txt    RENAMED    File without changes
{data → ET/data}/intrinsics/2011_F_EuMeT2wBo_00014_00001.npy    RENAMED    File without changes
{data → ET/data}/intrinsics/2011_KAeAqaA0Llg_00005_00001.npy    RENAMED    File without changes
{data → ET/data}/intrinsics/2011_MCkKihQrNA4_00014_00000.npy    RENAMED    File without changes
{data → ET/data}/traj/2011_F_EuMeT2wBo_00014_00001.txt    RENAMED    File without changes