Spaces:

fallenshock
/

FlowEdit

Running on Zero

FlowEdit / FlowEdit_utils.py

fallenshock

added files

336dbcf 5 days ago

13.9 kB

	from typing import Optional, Tuple, Union
	import torch
	from diffusers import FlowMatchEulerDiscreteScheduler
	from tqdm import tqdm
	import numpy as np

	from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import retrieve_timesteps



	def scale_noise(
	    scheduler,
	    sample: torch.FloatTensor,
	    timestep: Union[float, torch.FloatTensor],
	    noise: Optional[torch.FloatTensor] = None,
	) -> torch.FloatTensor:
	    """
	    Forward process in flow-matching: blend a clean sample with noise.

	    Computes ``sigma * noise + (1 - sigma) * sample``, where ``sigma`` is read
	    from ``scheduler.sigmas`` at the step index the scheduler assigns to
	    ``timestep``.

	    Args:
	        scheduler:
	            Object exposing ``_init_step_index(timestep)``, ``step_index`` and
	            an indexable ``sigmas``. Its step index is re-initialised on every
	            call, which is a side effect on the scheduler.
	        sample (`torch.FloatTensor`):
	            The input sample.
	        timestep (`float` or `torch.FloatTensor`):
	            The current timestep in the diffusion chain.
	        noise (`torch.FloatTensor`, *optional*):
	            Noise to mix into the sample. When omitted, standard Gaussian
	            noise matching `sample` (shape, dtype, device) is drawn.

	    Returns:
	        `torch.FloatTensor`:
	            The noised sample.
	    """
	    # The step index is always re-derived from `timestep`, even if the
	    # scheduler already holds one from an earlier call.
	    scheduler._init_step_index(timestep)
	    sigma = scheduler.sigmas[scheduler.step_index]

	    # `noise` is declared optional; without this guard the default value
	    # would fail on `sigma * None`.
	    if noise is None:
	        noise = torch.randn_like(sample)

	    return sigma * noise + (1.0 - sigma) * sample


	# for flux
	def calculate_shift(
	    image_seq_len,
	    base_seq_len: int = 256,
	    max_seq_len: int = 4096,
	    base_shift: float = 0.5,
	    max_shift: float = 1.16,
	):
	    """
	    Linearly map an image sequence length to a shift value.

	    The mapping is the straight line through ``(base_seq_len, base_shift)``
	    and ``(max_seq_len, max_shift)``, evaluated at ``image_seq_len``. Values
	    outside that range are extrapolated, not clamped.

	    Args:
	        image_seq_len: Sequence length to evaluate the line at.
	        base_seq_len: Sequence length that maps to `base_shift`.
	        max_seq_len: Sequence length that maps to `max_shift`.
	        base_shift: Shift at `base_seq_len`.
	        max_shift: Shift at `max_seq_len`.

	    Returns:
	        The interpolated shift value.
	    """
	    slope = (max_shift - base_shift) / (max_seq_len - base_seq_len)
	    intercept = base_shift - slope * base_seq_len
	    return image_seq_len * slope + intercept



	def calc_v_sd3(pipe, src_tar_latent_model_input, src_tar_prompt_embeds, src_tar_pooled_prompt_embeds, src_guidance_scale, tar_guidance_scale, t):
	    """
	    Run the transformer once on a stacked source/target batch and split the result.

	    With classifier-free guidance enabled, the model output is cut into four
	    equal chunks along the batch axis, in the order
	    ``[src_uncond, src_text, tar_uncond, tar_text]``. Each pair is combined as
	    ``uncond + scale * (text - uncond)`` using its own guidance scale.

	    Without classifier-free guidance, the output is cut into two chunks that
	    are returned as the source and target predictions unchanged. This assumes
	    the caller stacked the batch as ``[src, tar]`` (TODO confirm with callers).

	    Args:
	        pipe: Pipeline exposing ``transformer`` and ``do_classifier_free_guidance``.
	        src_tar_latent_model_input: Stacked latents passed as ``hidden_states``.
	        src_tar_prompt_embeds: Stacked prompt embeddings passed as
	            ``encoder_hidden_states``.
	        src_tar_pooled_prompt_embeds: Stacked pooled embeddings passed as
	            ``pooled_projections``.
	        src_guidance_scale: Guidance scale applied to the source pair.
	        tar_guidance_scale: Guidance scale applied to the target pair.
	        t: Current timestep tensor; expanded to one entry per batch element.

	    Returns:
	        Tuple ``(noise_pred_src, noise_pred_tar)``.
	    """
	    # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
	    timestep = t.expand(src_tar_latent_model_input.shape[0])

	    with torch.no_grad():
	        # one forward pass covers every source/target (and uncond/text) entry
	        noise_pred_src_tar = pipe.transformer(
	            hidden_states=src_tar_latent_model_input,
	            timestep=timestep,
	            encoder_hidden_states=src_tar_prompt_embeds,
	            pooled_projections=src_tar_pooled_prompt_embeds,
	            joint_attention_kwargs=None,
	            return_dict=False,
	        )[0]

	        if pipe.do_classifier_free_guidance:
	            (
	                src_noise_pred_uncond,
	                src_noise_pred_text,
	                tar_noise_pred_uncond,
	                tar_noise_pred_text,
	            ) = noise_pred_src_tar.chunk(4)
	            noise_pred_src = src_noise_pred_uncond + src_guidance_scale * (
	                src_noise_pred_text - src_noise_pred_uncond
	            )
	            noise_pred_tar = tar_noise_pred_uncond + tar_guidance_scale * (
	                tar_noise_pred_text - tar_noise_pred_uncond
	            )
	        else:
	            # Without guidance there is nothing to combine; previously both
	            # names were left unassigned here and the return raised.
	            noise_pred_src, noise_pred_tar = noise_pred_src_tar.chunk(2)

	    return noise_pred_src, noise_pred_tar



	def calc_v_flux(pipe, latents, prompt_embeds, pooled_prompt_embeds, guidance, text_ids, latent_image_ids, t):
	    """
	    Query the pipeline's transformer for a single prediction on `latents`.

	    The timestep is expanded to one entry per batch element and divided by
	    1000 before it is handed to the transformer. The call runs under
	    ``torch.no_grad()``.

	    Args:
	        pipe: Pipeline exposing ``transformer``.
	        latents: Latents passed as ``hidden_states``.
	        prompt_embeds: Passed as ``encoder_hidden_states``.
	        pooled_prompt_embeds: Passed as ``pooled_projections``.
	        guidance: Passed through as ``guidance``.
	        text_ids: Passed as ``txt_ids``.
	        latent_image_ids: Passed as ``img_ids``.
	        t: Current timestep tensor.

	    Returns:
	        The first element of the transformer's tuple output.
	    """
	    # one timestep entry per batch element (ONNX/Core ML friendly broadcast)
	    batch_timestep = t.expand(latents.shape[0])

	    with torch.no_grad():
	        transformer_kwargs = dict(
	            hidden_states=latents,
	            timestep=batch_timestep / 1000,
	            guidance=guidance,
	            encoder_hidden_states=prompt_embeds,
	            txt_ids=text_ids,
	            img_ids=latent_image_ids,
	            pooled_projections=pooled_prompt_embeds,
	            joint_attention_kwargs=None,
	            return_dict=False,
	        )
	        outputs = pipe.transformer(**transformer_kwargs)

	    return outputs[0]



	@torch.no_grad()
	def FlowEditSD3(
	    pipe,
	    scheduler,
	    x_src,
	    src_prompt,
	    tar_prompt,
	    negative_prompt,
	    T_steps: int = 50,
	    n_avg: int = 1,
	    src_guidance_scale: float = 3.5,
	    tar_guidance_scale: float = 13.5,
	    n_min: int = 0,
	    n_max: int = 15,
	):
	    """
	    Edit the latent `x_src` from `src_prompt` towards `tar_prompt` with an SD3-style pipeline.

	    The loop walks the scheduler timesteps and, for step index ``i``:

	    * skips the step while ``T_steps - i > n_max``;
	    * while ``T_steps - i > n_min``: noises `x_src` to the current time, shifts
	      the running edit by the same offset, asks the model for source and
	      target predictions, and moves the edit along their difference, averaged
	      over `n_avg` independent noise draws;
	    * otherwise (the last `n_min` steps): noises once on entry, then takes
	      regular sampling steps using only the target prediction.

	    Args:
	        pipe: Pipeline exposing ``encode_prompt``, ``transformer`` and
	            ``do_classifier_free_guidance``. Its ``_num_timesteps`` and
	            ``_guidance_scale`` attributes are overwritten.
	        scheduler: Scheduler handed to ``retrieve_timesteps`` and ``scale_noise``.
	        x_src: Source latent tensor; the edit starts as a copy of it.
	        src_prompt: Prompt describing the source.
	        tar_prompt: Prompt describing the desired edit.
	        negative_prompt: Negative prompt used for both encodings.
	        T_steps: Requested number of steps; replaced by the count that
	            ``retrieve_timesteps`` returns.
	        n_avg: Number of noise draws averaged per edit step.
	        src_guidance_scale: Guidance scale for the source branch.
	        tar_guidance_scale: Guidance scale for the target branch.
	        n_min: Number of trailing steps run as regular sampling.
	        n_max: Number of trailing steps that are processed at all.

	    Returns:
	        The edited latent: ``zt_edit`` when ``n_min == 0``, otherwise the
	        result of the final sampling phase.
	    """
	    device = x_src.device

	    timesteps, T_steps = retrieve_timesteps(scheduler, T_steps, device, timesteps=None)

	    pipe._num_timesteps = len(timesteps)
	    pipe._guidance_scale = src_guidance_scale

	    # src prompts
	    (
	        src_prompt_embeds,
	        src_negative_prompt_embeds,
	        src_pooled_prompt_embeds,
	        src_negative_pooled_prompt_embeds,
	    ) = pipe.encode_prompt(
	        prompt=src_prompt,
	        prompt_2=None,
	        prompt_3=None,
	        negative_prompt=negative_prompt,
	        do_classifier_free_guidance=pipe.do_classifier_free_guidance,
	        device=device,
	    )

	    # tar prompts
	    pipe._guidance_scale = tar_guidance_scale
	    (
	        tar_prompt_embeds,
	        tar_negative_prompt_embeds,
	        tar_pooled_prompt_embeds,
	        tar_negative_pooled_prompt_embeds,
	    ) = pipe.encode_prompt(
	        prompt=tar_prompt,
	        prompt_2=None,
	        prompt_3=None,
	        negative_prompt=negative_prompt,
	        do_classifier_free_guidance=pipe.do_classifier_free_guidance,
	        device=device,
	    )

	    # CFG prep: batch order must match the chunk order used by calc_v_sd3
	    src_tar_prompt_embeds = torch.cat(
	        [src_negative_prompt_embeds, src_prompt_embeds, tar_negative_prompt_embeds, tar_prompt_embeds],
	        dim=0,
	    )
	    src_tar_pooled_prompt_embeds = torch.cat(
	        [
	            src_negative_pooled_prompt_embeds,
	            src_pooled_prompt_embeds,
	            tar_negative_pooled_prompt_embeds,
	            tar_pooled_prompt_embeds,
	        ],
	        dim=0,
	    )

	    # initialize our ODE Zt_edit_1=x_src
	    zt_edit = x_src.clone()

	    for i, t in tqdm(enumerate(timesteps), total=len(timesteps)):

	        if T_steps - i > n_max:
	            continue

	        # current and next time, rescaled from scheduler timesteps by 1/1000
	        t_i = t / 1000
	        if i + 1 < len(timesteps):
	            t_im1 = timesteps[i + 1] / 1000
	        else:
	            t_im1 = torch.zeros_like(t_i)

	        if T_steps - i > n_min:

	            # Calculate the average of the V predictions
	            V_delta_avg = torch.zeros_like(x_src)
	            for _ in range(n_avg):

	                fwd_noise = torch.randn_like(x_src)

	                zt_src = (1 - t_i) * x_src + t_i * fwd_noise

	                zt_tar = zt_edit + zt_src - x_src

	                # NOTE(review): without classifier-free guidance this yields a
	                # tuple, not a batched tensor; that path looks unsupported — confirm.
	                src_tar_latent_model_input = torch.cat([zt_src, zt_src, zt_tar, zt_tar]) if pipe.do_classifier_free_guidance else (zt_src, zt_tar)

	                Vt_src, Vt_tar = calc_v_sd3(pipe, src_tar_latent_model_input, src_tar_prompt_embeds, src_tar_pooled_prompt_embeds, src_guidance_scale, tar_guidance_scale, t)

	                V_delta_avg += (1 / n_avg) * (Vt_tar - Vt_src)

	            # propagate direct ODE (update in float32, then cast back)
	            zt_edit = zt_edit.to(torch.float32)

	            zt_edit = zt_edit + (t_im1 - t_i) * V_delta_avg

	            zt_edit = zt_edit.to(V_delta_avg.dtype)

	        else:  # i >= T_steps-n_min # regular sampling for last n_min steps

	            if i == T_steps - n_min:
	                # initialize SDEDIT-style generation phase
	                fwd_noise = torch.randn_like(x_src)
	                xt_src = scale_noise(scheduler, x_src, t, noise=fwd_noise)
	                xt_tar = zt_edit + xt_src - x_src

	            # only the target prediction is used below, so every batch slot holds xt_tar
	            src_tar_latent_model_input = torch.cat([xt_tar, xt_tar, xt_tar, xt_tar]) if pipe.do_classifier_free_guidance else (xt_src, xt_tar)

	            _, noise_pred_tar = calc_v_sd3(pipe, src_tar_latent_model_input, src_tar_prompt_embeds, src_tar_pooled_prompt_embeds, src_guidance_scale, tar_guidance_scale, t)

	            xt_tar = xt_tar.to(torch.float32)

	            # step from t_i to t_im1; a zero-width step would leave xt_tar unchanged
	            prev_sample = xt_tar + (t_im1 - t_i) * noise_pred_tar

	            prev_sample = prev_sample.to(noise_pred_tar.dtype)

	            xt_tar = prev_sample

	    return zt_edit if n_min == 0 else xt_tar



	@torch.no_grad()
	def FlowEditFLUX(
	    pipe,
	    scheduler,
	    x_src,
	    src_prompt,
	    tar_prompt,
	    negative_prompt,
	    T_steps: int = 28,
	    n_avg: int = 1,
	    src_guidance_scale: float = 1.5,
	    tar_guidance_scale: float = 5.5,
	    n_min: int = 0,
	    n_max: int = 24,
	):
	    """
	    Edit the latent `x_src` from `src_prompt` towards `tar_prompt` with a FLUX-style pipeline.

	    The source latent is packed with the pipeline's own helpers, then the loop
	    walks the scheduler timesteps. For step index ``i`` it:

	    * skips the step while ``T_steps - i > n_max``;
	    * while ``T_steps - i > n_min``: noises the packed source to the current
	      sigma, shifts the running edit by the same offset, asks the model for a
	      source and a target prediction (two separate calls), and moves the edit
	      along their difference, averaged over `n_avg` noise draws;
	    * otherwise (the last `n_min` steps): noises once on entry, then takes
	      regular sampling steps using only the target prediction.

	    Args:
	        pipe: Pipeline exposing ``check_inputs``, ``prepare_latents``,
	            ``_pack_latents``, ``_unpack_latents``, ``encode_prompt``,
	            ``transformer`` and ``vae_scale_factor``. Its ``_num_timesteps``
	            and ``_guidance_scale`` attributes are overwritten.
	        scheduler: Scheduler providing ``config``, ``sigmas``, ``step_index``
	            and ``_init_step_index``.
	        x_src: Source latent tensor; axes 2 and 3 are read as the spatial size.
	        src_prompt: Prompt describing the source.
	        tar_prompt: Prompt describing the desired edit.
	        negative_prompt: Accepted but not used by this function.
	        T_steps: Requested number of steps; replaced by the count that
	            ``retrieve_timesteps`` returns.
	        n_avg: Number of noise draws averaged per edit step.
	        src_guidance_scale: Guidance value sent with source-branch calls when
	            the transformer config enables guidance embeddings.
	        tar_guidance_scale: Same, for target-branch calls.
	        n_min: Number of trailing steps run as regular sampling.
	        n_max: Number of trailing steps that are processed at all.

	    Returns:
	        The edited latent after ``pipe._unpack_latents``.
	    """
	    device = x_src.device
	    orig_height = x_src.shape[2] * pipe.vae_scale_factor // 2
	    orig_width = x_src.shape[3] * pipe.vae_scale_factor // 2
	    num_channels_latents = pipe.transformer.config.in_channels // 4

	    pipe.check_inputs(
	        prompt=src_prompt,
	        prompt_2=None,
	        height=orig_height,
	        width=orig_width,
	        callback_on_step_end_tensor_inputs=None,
	        max_sequence_length=512,
	    )

	    x_src, latent_src_image_ids = pipe.prepare_latents(
	        batch_size=x_src.shape[0],
	        num_channels_latents=num_channels_latents,
	        height=orig_height,
	        width=orig_width,
	        dtype=x_src.dtype,
	        device=x_src.device,
	        generator=None,
	        latents=x_src,
	    )
	    x_src_packed = pipe._pack_latents(x_src, x_src.shape[0], num_channels_latents, x_src.shape[2], x_src.shape[3])
	    # source and target share the same image position ids
	    latent_tar_image_ids = latent_src_image_ids

	    # 5. Prepare timesteps
	    sigmas = np.linspace(1.0, 1 / T_steps, T_steps)
	    image_seq_len = x_src_packed.shape[1]
	    mu = calculate_shift(
	        image_seq_len,
	        scheduler.config.base_image_seq_len,
	        scheduler.config.max_image_seq_len,
	        scheduler.config.base_shift,
	        scheduler.config.max_shift,
	    )
	    timesteps, T_steps = retrieve_timesteps(
	        scheduler,
	        T_steps,
	        device,
	        timesteps=None,
	        sigmas=sigmas,
	        mu=mu,
	    )

	    pipe._num_timesteps = len(timesteps)

	    # src prompts
	    (
	        src_prompt_embeds,
	        src_pooled_prompt_embeds,
	        src_text_ids,
	    ) = pipe.encode_prompt(
	        prompt=src_prompt,
	        prompt_2=None,
	        device=device,
	    )

	    # tar prompts
	    pipe._guidance_scale = tar_guidance_scale
	    (
	        tar_prompt_embeds,
	        tar_pooled_prompt_embeds,
	        tar_text_ids,
	    ) = pipe.encode_prompt(
	        prompt=tar_prompt,
	        prompt_2=None,
	        device=device,
	    )

	    # handle guidance: one value per batch element, or None when the model takes none
	    if pipe.transformer.config.guidance_embeds:
	        src_guidance = torch.tensor([src_guidance_scale], device=device)
	        src_guidance = src_guidance.expand(x_src_packed.shape[0])
	        tar_guidance = torch.tensor([tar_guidance_scale], device=device)
	        tar_guidance = tar_guidance.expand(x_src_packed.shape[0])
	    else:
	        src_guidance = None
	        tar_guidance = None

	    # initialize our ODE Zt_edit_1=x_src
	    zt_edit = x_src_packed.clone()

	    for i, t in tqdm(enumerate(timesteps), total=len(timesteps)):

	        if T_steps - i > n_max:
	            continue

	        scheduler._init_step_index(t)
	        t_i = scheduler.sigmas[scheduler.step_index]
	        # `i` comes from enumerate(timesteps), so the former `i < len(timesteps)`
	        # guard always held; the next sigma is read directly.
	        t_im1 = scheduler.sigmas[scheduler.step_index + 1]

	        if T_steps - i > n_min:

	            # Calculate the average of the V predictions
	            V_delta_avg = torch.zeros_like(x_src_packed)

	            for _ in range(n_avg):

	                fwd_noise = torch.randn_like(x_src_packed)

	                zt_src = (1 - t_i) * x_src_packed + t_i * fwd_noise

	                zt_tar = zt_edit + zt_src - x_src_packed

	                # Merge in the future to avoid double computation
	                Vt_src = calc_v_flux(
	                    pipe,
	                    latents=zt_src,
	                    prompt_embeds=src_prompt_embeds,
	                    pooled_prompt_embeds=src_pooled_prompt_embeds,
	                    guidance=src_guidance,
	                    text_ids=src_text_ids,
	                    latent_image_ids=latent_src_image_ids,
	                    t=t,
	                )

	                Vt_tar = calc_v_flux(
	                    pipe,
	                    latents=zt_tar,
	                    prompt_embeds=tar_prompt_embeds,
	                    pooled_prompt_embeds=tar_pooled_prompt_embeds,
	                    guidance=tar_guidance,
	                    text_ids=tar_text_ids,
	                    latent_image_ids=latent_tar_image_ids,
	                    t=t,
	                )

	                V_delta_avg += (1 / n_avg) * (Vt_tar - Vt_src)

	            # propagate direct ODE (update in float32, then cast back)
	            zt_edit = zt_edit.to(torch.float32)

	            zt_edit = zt_edit + (t_im1 - t_i) * V_delta_avg

	            zt_edit = zt_edit.to(V_delta_avg.dtype)

	        else:  # i >= T_steps-n_min # regular sampling last n_min steps

	            if i == T_steps - n_min:
	                # initialize SDEDIT-style generation phase
	                fwd_noise = torch.randn_like(x_src_packed)
	                xt_src = scale_noise(scheduler, x_src_packed, t, noise=fwd_noise)
	                xt_tar = zt_edit + xt_src - x_src_packed

	            Vt_tar = calc_v_flux(
	                pipe,
	                latents=xt_tar,
	                prompt_embeds=tar_prompt_embeds,
	                pooled_prompt_embeds=tar_pooled_prompt_embeds,
	                guidance=tar_guidance,
	                text_ids=tar_text_ids,
	                latent_image_ids=latent_tar_image_ids,
	                t=t,
	            )

	            xt_tar = xt_tar.to(torch.float32)

	            prev_sample = xt_tar + (t_im1 - t_i) * Vt_tar

	            prev_sample = prev_sample.to(Vt_tar.dtype)
	            xt_tar = prev_sample

	    out = zt_edit if n_min == 0 else xt_tar
	    unpacked_out = pipe._unpack_latents(out, orig_height, orig_width, pipe.vae_scale_factor)
	    return unpacked_out