pcuenq (HF staff) committed
Commit
0dee401
1 Parent(s): a51a8cb

Add custom endpoint handler

Files changed (3)
  1. handler.py +148 -0
  2. modules.py +178 -0
  3. requirements.txt +5 -0
handler.py ADDED
@@ -0,0 +1,148 @@
+ from typing import Dict, Any
+ import base64
+ from io import BytesIO
+ from pathlib import Path
+ import torch
+ from torch import autocast
+ import open_clip
+ from open_clip import tokenizer
+ from rudalle import get_vae
+ from einops import rearrange
+ from PIL import Image
+
+ from modules import DenoiseUNet
+
+ # set device
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ batch_size = 1
+ steps = 11
+ scale = 5
+
+
+ def to_pil(images):
+     """Convert a batch of [0, 1] BCHW tensors to a list of PIL images."""
+     images = images.permute(0, 2, 3, 1).cpu().numpy()
+     images = (images * 255).round().astype("uint8")
+     images = [Image.fromarray(image) for image in images]
+     return images
+
+
+ def log(t, eps=1e-20):
+     return torch.log(t + eps)
+
+
+ def gumbel_noise(t):
+     noise = torch.zeros_like(t).uniform_(0, 1)
+     return -log(-log(noise))
+
+
+ def gumbel_sample(t, temperature=1., dim=-1):
+     return ((t / max(temperature, 1e-10)) + gumbel_noise(t)).argmax(dim=dim)
+
+
+ def sample(model, c, x=None, mask=None, T=12, size=(32, 32), starting_t=0, temp_range=[1.0, 1.0],
+            typical_filtering=True, typical_mass=0.2, typical_min_tokens=1, classifier_free_scale=-1,
+            renoise_steps=11, renoise_mode='start'):
+     """Iteratively denoise a grid of discrete VQ tokens, conditioned on CLIP embeddings `c`."""
+     with torch.inference_mode():
+         r_range = torch.linspace(0, 1, T + 1)[:-1][:, None].expand(-1, c.size(0)).to(c.device)
+         temperatures = torch.linspace(temp_range[0], temp_range[1], T)
+         preds = []
+         if x is None:
+             x = torch.randint(0, model.num_labels, size=(c.size(0), *size), device=c.device)
+         elif mask is not None:
+             noise = torch.randint(0, model.num_labels, size=(c.size(0), *size), device=c.device)
+             x = noise * mask + (1 - mask) * x
+         init_x = x.clone()
+         for i in range(starting_t, T):
+             if renoise_mode == 'prev':
+                 prev_x = x.clone()
+             r, temp = r_range[i], temperatures[i]
+             logits = model(x, c, r)
+             if classifier_free_scale >= 0:
+                 # classifier-free guidance: interpolate between unconditional and conditional logits
+                 logits_uncond = model(x, torch.zeros_like(c), r)
+                 logits = torch.lerp(logits_uncond, logits, classifier_free_scale)
+             x = logits
+             x_flat = x.permute(0, 2, 3, 1).reshape(-1, x.size(1))
+             if typical_filtering:
+                 # typical sampling: keep tokens whose information content is close to the entropy
+                 x_flat_norm = torch.nn.functional.log_softmax(x_flat, dim=-1)
+                 x_flat_norm_p = torch.exp(x_flat_norm)
+                 entropy = -(x_flat_norm * x_flat_norm_p).nansum(-1, keepdim=True)
+
+                 c_flat_shifted = torch.abs((-x_flat_norm) - entropy)
+                 c_flat_sorted, x_flat_indices = torch.sort(c_flat_shifted, descending=False)
+                 x_flat_cumsum = x_flat.gather(-1, x_flat_indices).softmax(dim=-1).cumsum(dim=-1)
+
+                 last_ind = (x_flat_cumsum < typical_mass).sum(dim=-1)
+                 sorted_indices_to_remove = c_flat_sorted > c_flat_sorted.gather(1, last_ind.view(-1, 1))
+                 if typical_min_tokens > 1:
+                     sorted_indices_to_remove[..., :typical_min_tokens] = 0
+                 indices_to_remove = sorted_indices_to_remove.scatter(1, x_flat_indices, sorted_indices_to_remove)
+                 x_flat = x_flat.masked_fill(indices_to_remove, -float("Inf"))
+             x_flat = gumbel_sample(x_flat, temperature=temp)
+             x = x_flat.view(x.size(0), *x.shape[2:])
+             if mask is not None:
+                 x = x * mask + (1 - mask) * init_x
+             if i < renoise_steps:
+                 # re-noise for the next step according to the chosen renoising strategy
+                 if renoise_mode == 'start':
+                     x, _ = model.add_noise(x, r_range[i + 1], random_x=init_x)
+                 elif renoise_mode == 'prev':
+                     x, _ = model.add_noise(x, r_range[i + 1], random_x=prev_x)
+                 else:  # 'rand'
+                     x, _ = model.add_noise(x, r_range[i + 1])
+             preds.append(x.detach())
+     return preds
+
+
+ class EndpointHandler():
+     def __init__(self, path=""):
+         model_path = Path(path) / "model_600000.pt"
+         state_dict = torch.load(model_path, map_location=device)
+         model = DenoiseUNet(num_labels=8192).to(device)
+         model.load_state_dict(state_dict)
+         model.eval().requires_grad_(False)
+         self.model = model
+
+         vqmodel = get_vae().to(device)
+         vqmodel.eval().requires_grad_(False)
+         self.vqmodel = vqmodel
+
+         clip_model, _, _ = open_clip.create_model_and_transforms('ViT-g-14', pretrained='laion2b_s12b_b42k')
+         clip_model = clip_model.to(device).eval().requires_grad_(False)
+         self.clip_model = clip_model
+
+     def encode(self, x):
+         return self.vqmodel.model.encode((2 * x - 1))[-1][-1]
+
+     def decode(self, img_seq, shape=(32, 32)):
+         img_seq = img_seq.view(img_seq.shape[0], -1)
+         one_hot_indices = torch.nn.functional.one_hot(img_seq, num_classes=self.vqmodel.num_tokens).float()
+         z = one_hot_indices @ self.vqmodel.model.quantize.embed.weight
+         z = rearrange(z, 'b (h w) c -> b c h w', h=shape[0], w=shape[1])
+         img = self.vqmodel.model.decode(z)
+         img = (img.clamp(-1., 1.) + 1) * 0.5
+         return img
+
+     def __call__(self, data: Any) -> Dict[str, str]:
+         """
+         Args:
+             data (:obj:`dict`):
+                 Includes the input text prompt and the parameters for the inference.
+         Return:
+             A :obj:`dict` with the base64-encoded JPEG under the "image" key.
+         """
+         inputs = data.pop("inputs", data)
+
+         latent_shape = (32, 32)
+         tokenized_text = tokenizer.tokenize([inputs] * batch_size).to(device)
+         with autocast(device.type):
+             clip_embeddings = self.clip_model.encode_text(tokenized_text)
+             images = sample(
+                 self.model, clip_embeddings, T=12, size=latent_shape, starting_t=0, temp_range=[1.0, 1.0],
+                 typical_filtering=True, typical_mass=0.2, typical_min_tokens=1,
+                 classifier_free_scale=scale, renoise_steps=steps, renoise_mode="start"
+             )
+         images = self.decode(images[-1], latent_shape)
+         images = to_pil(images)
+
+         # encode image as base64
+         buffered = BytesIO()
+         images[0].save(buffered, format="JPEG")
+         img_str = base64.b64encode(buffered.getvalue())
+
+         # postprocess the prediction
+         return {"image": img_str.decode()}
modules.py ADDED
@@ -0,0 +1,178 @@
+ import math
+ import numpy as np
+ import torch
+ import torch.nn as nn
+
+
+ class ModulatedLayerNorm(nn.Module):
+     """LayerNorm whose scale and shift can be modulated by a conditioning tensor `w`."""
+     def __init__(self, num_features, eps=1e-6, channels_first=True):
+         super().__init__()
+         self.ln = nn.LayerNorm(num_features, eps=eps)
+         self.gamma = nn.Parameter(torch.randn(1, 1, 1))
+         self.beta = nn.Parameter(torch.randn(1, 1, 1))
+         self.channels_first = channels_first
+
+     def forward(self, x, w=None):
+         x = x.permute(0, 2, 3, 1) if self.channels_first else x
+         if w is None:
+             x = self.ln(x)
+         else:
+             x = self.gamma * w * self.ln(x) + self.beta * w
+         x = x.permute(0, 3, 1, 2) if self.channels_first else x
+         return x
+
+
+ class ResBlock(nn.Module):
+     def __init__(self, c, c_hidden, c_cond=0, c_skip=0, scaler=None, layer_scale_init_value=1e-6):
+         super().__init__()
+         self.depthwise = nn.Sequential(
+             nn.ReflectionPad2d(1),
+             nn.Conv2d(c, c, kernel_size=3, groups=c)
+         )
+         self.ln = ModulatedLayerNorm(c, channels_first=False)
+         self.channelwise = nn.Sequential(
+             nn.Linear(c + c_skip, c_hidden),
+             nn.GELU(),
+             nn.Linear(c_hidden, c),
+         )
+         self.gamma = nn.Parameter(layer_scale_init_value * torch.ones(c), requires_grad=True) if layer_scale_init_value > 0 else None
+         self.scaler = scaler
+         if c_cond > 0:
+             self.cond_mapper = nn.Linear(c_cond, c)
+
+     def forward(self, x, s=None, skip=None):
+         res = x
+         x = self.depthwise(x)
+         if s is not None:
+             # broadcast or resize the conditioning map to the spatial size of x
+             if s.size(2) == s.size(3) == 1:
+                 s = s.expand(-1, -1, x.size(2), x.size(3))
+             elif s.size(2) != x.size(2) or s.size(3) != x.size(3):
+                 s = nn.functional.interpolate(s, size=x.shape[-2:], mode='bilinear')
+             s = self.cond_mapper(s.permute(0, 2, 3, 1))
+         x = self.ln(x.permute(0, 2, 3, 1), s)
+         if skip is not None:
+             x = torch.cat([x, skip.permute(0, 2, 3, 1)], dim=-1)
+         x = self.channelwise(x)
+         x = self.gamma * x if self.gamma is not None else x
+         x = res + x.permute(0, 3, 1, 2)
+         if self.scaler is not None:
+             x = self.scaler(x)
+         return x
+
+
+ class DenoiseUNet(nn.Module):
+     def __init__(self, num_labels, c_hidden=1280, c_clip=1024, c_r=64, down_levels=[4, 8, 16], up_levels=[16, 8, 4]):
+         super().__init__()
+         self.num_labels = num_labels
+         self.c_r = c_r
+         self.down_levels = down_levels
+         self.up_levels = up_levels
+         c_levels = [c_hidden // (2 ** i) for i in reversed(range(len(down_levels)))]
+         self.embedding = nn.Embedding(num_labels, c_levels[0])
+
+         # DOWN BLOCKS
+         self.down_blocks = nn.ModuleList()
+         for i, num_blocks in enumerate(down_levels):
+             blocks = []
+             if i > 0:
+                 blocks.append(nn.Conv2d(c_levels[i - 1], c_levels[i], kernel_size=4, stride=2, padding=1))
+             for _ in range(num_blocks):
+                 block = ResBlock(c_levels[i], c_levels[i] * 4, c_clip + c_r)
+                 block.channelwise[-1].weight.data *= np.sqrt(1 / sum(down_levels))
+                 blocks.append(block)
+             self.down_blocks.append(nn.ModuleList(blocks))
+
+         # UP BLOCKS
+         self.up_blocks = nn.ModuleList()
+         for i, num_blocks in enumerate(up_levels):
+             blocks = []
+             for j in range(num_blocks):
+                 block = ResBlock(c_levels[len(c_levels) - 1 - i], c_levels[len(c_levels) - 1 - i] * 4, c_clip + c_r,
+                                  c_levels[len(c_levels) - 1 - i] if (j == 0 and i > 0) else 0)
+                 block.channelwise[-1].weight.data *= np.sqrt(1 / sum(up_levels))
+                 blocks.append(block)
+             if i < len(up_levels) - 1:
+                 blocks.append(
+                     nn.ConvTranspose2d(c_levels[len(c_levels) - 1 - i], c_levels[len(c_levels) - 2 - i], kernel_size=4, stride=2, padding=1))
+             self.up_blocks.append(nn.ModuleList(blocks))
+
+         self.clf = nn.Conv2d(c_levels[0], num_labels, kernel_size=1)
+
+     def gamma(self, r):
+         # cosine noise schedule: r=0 -> 1 (fully noised), r=1 -> 0 (clean)
+         return (r * torch.pi / 2).cos()
+
+     def add_noise(self, x, r, random_x=None):
+         r = self.gamma(r)[:, None, None]
+         mask = torch.bernoulli(r * torch.ones_like(x))
+         mask = mask.round().long()
+         if random_x is None:
+             random_x = torch.randint_like(x, 0, self.num_labels)
+         x = x * (1 - mask) + random_x * mask
+         return x, mask
+
+     def gen_r_embedding(self, r, max_positions=10000):
+         # sinusoidal embedding of the (cosine-scheduled) noise level
+         dtype = r.dtype
+         r = self.gamma(r) * max_positions
+         half_dim = self.c_r // 2
+         emb = math.log(max_positions) / (half_dim - 1)
+         emb = torch.arange(half_dim, device=r.device).float().mul(-emb).exp()
+         emb = r[:, None] * emb[None, :]
+         emb = torch.cat([emb.sin(), emb.cos()], dim=1)
+         if self.c_r % 2 == 1:  # zero pad
+             emb = nn.functional.pad(emb, (0, 1), mode='constant')
+         return emb.to(dtype)
+
+     def _down_encode(self, x, s):
+         level_outputs = []
+         for i, blocks in enumerate(self.down_blocks):
+             for block in blocks:
+                 if isinstance(block, ResBlock):
+                     x = block(x, s)
+                 else:
+                     x = block(x)
+             level_outputs.insert(0, x)
+         return level_outputs
+
+     def _up_decode(self, level_outputs, s):
+         x = level_outputs[0]
+         for i, blocks in enumerate(self.up_blocks):
+             for j, block in enumerate(blocks):
+                 if isinstance(block, ResBlock):
+                     if i > 0 and j == 0:
+                         # first block of each upper level consumes the skip connection
+                         x = block(x, s, level_outputs[i])
+                     else:
+                         x = block(x, s)
+                 else:
+                     x = block(x)
+         return x
+
+     def forward(self, x, c, r):  # r is a uniform value between 0 and 1
+         r_embed = self.gen_r_embedding(r)
+         x = self.embedding(x).permute(0, 3, 1, 2)
+         if len(c.shape) == 2:
+             s = torch.cat([c, r_embed], dim=-1)[:, :, None, None]
+         else:
+             r_embed = r_embed[:, :, None, None].expand(-1, -1, c.size(2), c.size(3))
+             s = torch.cat([c, r_embed], dim=1)
+         level_outputs = self._down_encode(x, s)
+         x = self._up_decode(level_outputs, s)
+         x = self.clf(x)
+         return x
+
+
+ if __name__ == '__main__':
+     device = "cuda"
+     model = DenoiseUNet(1024).to(device)
+     print(sum(p.numel() for p in model.parameters()))
+     x = torch.randint(0, 1024, (1, 32, 32)).long().to(device)
+     c = torch.randn((1, 1024)).to(device)
+     r = torch.rand(1).to(device)
+     model(x, c, r)
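As a sanity check on the cosine schedule above: `gamma(r) = cos(r·π/2)` maps `r=0` to masking probability 1 (every token replaced) and `r=1` to 0 (clean), which is why `sample()` in handler.py walks `r` from 0 towards 1. A small sketch (not part of this commit; the `c_hidden`/level arguments are arbitrary small values just to construct the module cheaply, since `add_noise` uses no learned weights):

```python
# Sketch: observe the masking probability used by DenoiseUNet.add_noise.
import torch
from modules import DenoiseUNet

model = DenoiseUNet(num_labels=1024, c_hidden=64, down_levels=[1], up_levels=[1])
x = torch.randint(0, 1024, (1, 32, 32))
for r in [0.0, 0.5, 0.9]:
    _, mask = model.add_noise(x, torch.tensor([r]))
    print(f"r={r:.1f} -> fraction of tokens replaced: {mask.float().mean():.2f}")
# Expected: ~1.00 at r=0, ~0.71 at r=0.5 (cos(pi/4)), ~0.16 at r=0.9
```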
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ -f https://download.pytorch.org/whl/cu116
+ torch
+ rudalle
+ open_clip_torch
+ einops
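Once deployed as an Inference Endpoint, the handler can be queried over HTTP. A sketch (the endpoint URL and token are placeholders; the payload and response shape match `EndpointHandler.__call__` above):

```python
# Sketch: query the deployed endpoint. ENDPOINT_URL and HF_TOKEN are placeholders.
import base64
from io import BytesIO

import requests
from PIL import Image

ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"
HF_TOKEN = "hf_..."

response = requests.post(
    ENDPOINT_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"},
    json={"inputs": "a painting of a red fox in the snow"},
)
response.raise_for_status()
# The handler returns {"image": <base64-encoded JPEG>}
image = Image.open(BytesIO(base64.b64decode(response.json()["image"])))
image.save("output.jpg")
```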