Spaces:

bkhmsi
/

Font-To-Sketch

Runtime error

Badr AlKhamissi

starting space

913d3e3 over 1 year ago

8.69 kB

	import torch.nn as nn
	import torchvision
	from scipy.spatial import Delaunay
	import torch
	import numpy as np
	from torch.nn import functional as nnf
	from easydict import EasyDict
	from shapely.geometry import Point
	from shapely.geometry.polygon import Polygon
	from torchvision import transforms
	from PIL import Image
	from transformers import CLIPProcessor, CLIPModel

	from diffusers import StableDiffusionPipeline

	class SDSLoss(nn.Module):
	    """Score-distillation style loss computed with a Stable Diffusion pipeline.

	    The rendered image is encoded by the pipeline's VAE, noised at a random
	    timestep, and the UNet's classifier-free-guided noise prediction is
	    compared with the injected noise. The weighted difference is treated as a
	    constant and multiplied with the latents, so back-propagating the returned
	    scalar pushes that difference into whatever produced the input image.
	    """

	    def __init__(self, cfg, device):
	        """Load the diffusion and CLIP models and pre-compute the conditioning.

	        Args:
	            cfg: configuration object; this class reads ``cfg.diffusion.model``,
	                ``cfg.diffusion.timesteps``, ``cfg.diffusion.guidance_scale``,
	                ``cfg.token``, ``cfg.caption`` and ``cfg.batch_size``.
	            device: device the pipeline, CLIP model and schedule tensors are
	                moved to.
	        """
	        super().__init__()
	        self.cfg = cfg
	        self.device = device
	        self.pipe = StableDiffusionPipeline.from_pretrained(
	            cfg.diffusion.model,
	            torch_dtype=torch.float16,
	            use_auth_token=cfg.token,
	        )
	        self.pipe = self.pipe.to(self.device)

	        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(self.device)
	        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

	        # default scheduler: PNDMScheduler(beta_start=0.00085, beta_end=0.012,
	        # beta_schedule="scaled_linear", num_train_timesteps=1000)
	        self.alphas = self.pipe.scheduler.alphas_cumprod.to(self.device)
	        self.sigmas = (1 - self.pipe.scheduler.alphas_cumprod).to(self.device)

	        self.text_embeddings = None
	        self.embed_text()

	    def embed_text(self):
	        """Build ``self.text_embeddings`` from ``cfg.caption``.

	        A caption that does not contain the substring ``"jpeg"`` is tokenised
	        and encoded with the pipeline's text encoder, together with an empty
	        prompt for the unconditional branch. Otherwise the caption is opened as
	        an image file and its CLIP image features are used for both branches.

	        The result is ``[unconditional, conditional]`` concatenated along dim 0
	        with every row repeated ``cfg.batch_size`` times. The pipeline's
	        tokenizer and text encoder are deleted afterwards.
	        """
	        if "jpeg" not in self.cfg.caption:
	            # tokenize and embed text
	            text_input = self.pipe.tokenizer(
	                self.cfg.caption,
	                padding="max_length",
	                max_length=self.pipe.tokenizer.model_max_length,
	                truncation=True,
	                return_tensors="pt",
	            )
	            uncond_input = self.pipe.tokenizer(
	                [""],
	                padding="max_length",
	                max_length=text_input.input_ids.shape[-1],
	                return_tensors="pt",
	            )
	            with torch.no_grad():
	                text_embeddings = self.pipe.text_encoder(text_input.input_ids.to(self.device))[0]
	                uncond_embeddings = self.pipe.text_encoder(uncond_input.input_ids.to(self.device))[0]
	        else:
	            print(f"> Reading Image {self.cfg.caption}")
	            with torch.no_grad():
	                image = Image.open(self.cfg.caption)
	                inputs = self.clip_processor(images=image, return_tensors="pt").to(self.device)
	                img_emb = self.clip_model.get_image_features(**inputs)
	                # NOTE(review): the same image embedding is used for both the
	                # conditional and unconditional branch, so the guidance term in
	                # forward() presumably cancels - confirm this is intended.
	                text_embeddings = img_emb
	                uncond_embeddings = img_emb

	        print(text_embeddings.size())
	        print(uncond_embeddings.size())
	        self.text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
	        self.text_embeddings = self.text_embeddings.repeat_interleave(self.cfg.batch_size, 0)
	        # The embeddings are cached above, so the tokenizer/encoder are dropped.
	        del self.pipe.tokenizer
	        del self.pipe.text_encoder

	    def forward(self, x_aug):
	        """Return the scalar SDS loss for a batch of rendered images.

	        Args:
	            x_aug: image batch fed to the VAE after the affine map
	                ``x * 2 - 1`` (presumably values in [0, 1] - confirm with the
	                caller).

	        Returns:
	            A scalar tensor: the weighted, detached noise residual multiplied
	            by the latents, summed over dim 1 and averaged.
	        """
	        # encode rendered image
	        x = x_aug * 2. - 1.
	        with torch.cuda.amp.autocast():
	            init_latent_z = self.pipe.vae.encode(x).latent_dist.sample()
	        latent_z = 0.18215 * init_latent_z  # scaling_factor * init_latents

	        with torch.inference_mode():
	            # sample timesteps
	            timestep = torch.randint(
	                low=50,
	                high=min(950, self.cfg.diffusion.timesteps) - 1,  # avoid highest timestep | diffusion.timesteps=1000
	                size=(latent_z.shape[0],),
	                device=self.device, dtype=torch.long)

	            # add noise
	            eps = torch.randn_like(latent_z)
	            # zt = alpha_t * latent_z + sigma_t * eps
	            noised_latent_zt = self.pipe.scheduler.add_noise(latent_z, eps, timestep)

	            # denoise
	            z_in = torch.cat([noised_latent_zt] * 2)  # expand latents for classifier free guidance
	            # The timesteps are duplicated as well so they line up with the
	            # doubled batch in z_in for any batch size.
	            timestep_in = torch.cat([timestep] * 2)
	            with torch.autocast(device_type="cuda", dtype=torch.float16):
	                eps_t_uncond, eps_t = self.pipe.unet(
	                    z_in, timestep_in, encoder_hidden_states=self.text_embeddings
	                ).sample.float().chunk(2)

	            eps_t = eps_t_uncond + self.cfg.diffusion.guidance_scale * (eps_t - eps_t_uncond)

	            # w = alphas[timestep]^0.5 * (1 - alphas[timestep]) = alphas[timestep]^0.5 * sigmas[timestep]
	            # Reshaped to (B, 1, 1, 1) so each sample's weight broadcasts over
	            # its own latent instead of over the trailing spatial dimension.
	            weight = (self.alphas[timestep] ** 0.5 * self.sigmas[timestep]).view(-1, 1, 1, 1)
	            grad_z = weight * (eps_t - eps)
	            assert torch.isfinite(grad_z).all()
	            grad_z = torch.nan_to_num(grad_z.detach().float(), 0.0, 0.0, 0.0)

	        # Cloned outside inference mode so the product with latent_z can take
	        # part in autograd; grad_z itself acts as a constant.
	        sds_loss = grad_z.clone() * latent_z
	        del grad_z

	        sds_loss = sds_loss.sum(1).mean()
	        return sds_loss


	class ToneLoss(nn.Module):
	    """MSE between Gaussian-blurred versions of the initial and current raster.

	    The loss is scaled by ``cfg.loss.tone.dist_loss_weight`` and, when a step
	    is supplied, by a Gaussian bump over the step index centred at step 300.
	    """

	    def __init__(self, cfg):
	        """Read the tone-loss settings from ``cfg.loss.tone`` and build the blur.

	        Args:
	            cfg: configuration object; reads ``dist_loss_weight``,
	                ``pixel_dist_kernel_blur`` and ``pixel_dist_sigma`` under
	                ``cfg.loss.tone``.
	        """
	        super().__init__()
	        self.dist_loss_weight = cfg.loss.tone.dist_loss_weight
	        self.im_init = None
	        # Filled in by set_image_init(); forward() refuses to run without it.
	        self.init_blurred = None
	        self.cfg = cfg
	        self.mse_loss = nn.MSELoss()
	        kernel = cfg.loss.tone.pixel_dist_kernel_blur
	        self.blurrer = torchvision.transforms.GaussianBlur(
	            kernel_size=(kernel, kernel),
	            sigma=(cfg.loss.tone.pixel_dist_sigma),
	        )

	    def set_image_init(self, im_init):
	        """Store the reference image and its blurred version.

	        Args:
	            im_init: 3-D tensor; dims are permuted with ``(2, 0, 1)`` and a
	                leading dim is added (presumably HWC -> 1xCxHxW - confirm).
	        """
	        self.im_init = im_init.permute(2, 0, 1).unsqueeze(0)
	        self.init_blurred = self.blurrer(self.im_init)

	    def get_scheduler(self, step=None):
	        """Return the loss weight, optionally modulated by the step index.

	        Args:
	            step: optimisation step, or ``None`` for the constant weight.

	        Returns:
	            ``dist_loss_weight`` when ``step`` is ``None``; otherwise
	            ``dist_loss_weight * exp(-(1/5) * ((step - 300) / 20) ** 2)``.
	        """
	        if step is None:
	            return self.dist_loss_weight
	        # Gaussian bump centred at step 300: the weight peaks there and decays
	        # on both sides.
	        return self.dist_loss_weight * np.exp(-(1 / 5) * ((step - 300) / 20) ** 2)

	    def forward(self, cur_raster, step=None):
	        """Return the weighted MSE between the blurred reference and ``cur_raster``.

	        Args:
	            cur_raster: current raster, blurred with the same kernel as the
	                reference before comparison.
	            step: forwarded to :meth:`get_scheduler`.

	        Raises:
	            RuntimeError: if :meth:`set_image_init` has not been called yet.
	        """
	        if self.init_blurred is None:
	            raise RuntimeError("ToneLoss.set_image_init() must be called before forward()")
	        blurred_cur = self.blurrer(cur_raster)
	        return self.mse_loss(self.init_blurred.detach(), blurred_cur) * self.get_scheduler(step)


	class ConformalLoss:
	    """Penalise changes in the triangle angles of each letter's Delaunay mesh.

	    At construction the control points are triangulated and the triangles whose
	    centroid lies inside each target letter's outline are kept. The angles of
	    those triangles are recorded as a reference; calling the instance returns
	    the summed MSE between the current angles and that reference.
	    """

	    def __init__(self, parameters: "EasyDict", device: torch.device, target_letter: str, shape_groups):
	        """Build the per-letter triangle index tensors and the reference angles.

	        Args:
	            parameters: object whose ``point`` attribute is a sequence of point
	                tensors, one per shape.
	            device: device on which the face index tensors are stored.
	            target_letter: letters to constrain; iterated character by
	                character and zipped with ``shape_groups``.
	            shape_groups: groups exposing ``shape_ids``, the indices into
	                ``parameters.point`` that belong to each letter.
	        """
	        self.parameters = parameters
	        self.target_letter = target_letter
	        self.shape_groups = shape_groups
	        self.faces = self.init_faces(device)
	        # Vertex indices rotated by one position, used to form edge vectors.
	        self.faces_roll_a = [torch.roll(face, 1, 1) for face in self.faces]

	        with torch.no_grad():
	            self.angles = []
	            self.reset()

	    def get_angles(self, points: torch.Tensor) -> list:
	        """Return one angle tensor per letter, shaped (num_triangles, 3).

	        Args:
	            points: all control points concatenated, indexed by the face
	                tensors.
	        """
	        angles_ = []
	        for faces, faces_rolled in zip(self.faces, self.faces_roll_a):
	            triangles = points[faces]
	            triangles_roll_a = points[faces_rolled]
	            edges = triangles_roll_a - triangles
	            length = edges.norm(dim=-1)
	            # The additive constant keeps the division finite for zero-length
	            # edges.
	            edges = edges / (length + 1e-1)[:, :, None]
	            edges_roll = torch.roll(edges, 1, 1)
	            cosine = torch.einsum('ned,ned->ne', edges, edges_roll)
	            angles_.append(torch.arccos(cosine))
	        return angles_

	    def get_letter_inds(self, letter_to_insert):
	        """Return ``(first_id, last_id, count)`` of the shapes of a letter.

	        The first group whose paired character equals ``letter_to_insert`` is
	        used. Returns ``None`` when no character matches.
	        """
	        # NOTE(review): for a target string with repeated characters this always
	        # resolves to the first occurrence - confirm that is intended.
	        for group, l in zip(self.shape_groups, self.target_letter):
	            if l == letter_to_insert:
	                letter_inds = group.shape_ids
	                return letter_inds[0], letter_inds[-1], len(letter_inds)
	        return None

	    def reset(self):
	        """Record the angles of the current (detached) points as the reference."""
	        points = torch.cat([point.clone().detach() for point in self.parameters.point])
	        self.angles = self.get_angles(points)

	    def init_faces(self, device: torch.device) -> list:
	        """Triangulate all points and keep, per letter, the triangles inside it.

	        Returns:
	            A list with one int64 tensor of triangle vertex indices per
	            character of ``target_letter``.

	        Raises:
	            ValueError: if a character of ``target_letter`` has no matching
	                shape group.
	        """
	        # The point set is the same for every letter, so the numpy conversion,
	        # the triangulation and the centroids are computed once.
	        shapes_np = [point.clone().detach().cpu().numpy() for point in self.parameters.point]
	        all_points = np.concatenate(shapes_np)
	        simplices = Delaunay(all_points).simplices
	        centroids = [Point(all_points[face].mean(0)) for face in simplices]

	        faces_ = []
	        for c in self.target_letter:
	            letter_inds = self.get_letter_inds(c)
	            if letter_inds is None:
	                raise ValueError(f"no shape group found for letter {c!r}")
	            start_ind, end_ind, shapes_per_letter = letter_inds
	            print(c, start_ind, end_ind)
	            holes = []
	            if shapes_per_letter > 1:
	                # NOTE(review): the slice stops before end_ind, so the letter's
	                # last shape is not treated as a hole - confirm this is intended.
	                holes = shapes_np[start_ind+1:end_ind]
	            poly = Polygon(shapes_np[start_ind], holes=holes)
	            poly = poly.buffer(0)
	            is_intersect = np.array([poly.contains(centroid) for centroid in centroids], dtype=np.bool_)
	            faces_.append(torch.from_numpy(simplices[is_intersect]).to(device, dtype=torch.int64))
	        return faces_

	    def __call__(self) -> torch.Tensor:
	        """Return the sum over letters of the MSE between current and reference angles."""
	        points = torch.cat(self.parameters.point)
	        angles = self.get_angles(points)
	        loss_angles = 0
	        for current, reference in zip(angles, self.angles):
	            loss_angles += nnf.mse_loss(current, reference)
	        return loss_angles