import math

import gradio as gr
import numpy as np
import spaces
import torch
from diffusers import StableDiffusionXLPipeline, DDIMScheduler
from diffusers.utils import load_image

import inversion
import sa_handler

# init models
scheduler = DDIMScheduler(
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    clip_sample=False,
    set_alpha_to_one=False,
)
pipeline = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True,
    scheduler=scheduler,
).to("cuda")
pipeline.enable_model_cpu_offload()
pipeline.enable_vae_slicing()


@spaces.GPU(duration=120)
def run(ref_path, ref_style, ref_prompt, prompt1, prompt2, prompt3):
    # DDIM inversion of the reference image
    src_style = f"{ref_style}"
    src_prompt = f"{ref_prompt}, {src_style}."
    image_path = f"{ref_path}"
    num_inference_steps = 50
    x0 = np.array(load_image(image_path).resize((1024, 1024)))
    try:
        zts = inversion.ddim_inversion(pipeline, x0, src_prompt, num_inference_steps, 2)
    except Exception:
        # retry once if the first inversion attempt fails
        zts = inversion.ddim_inversion(pipeline, x0, src_prompt, num_inference_steps, 2)
    # mediapy.show_image(x0, title="input reference image", height=256)

    # run StyleAligned: generate the reference prompt together with the first target prompt
    prompts = [
        src_prompt,
        prompt1,
    ]

    # some parameters you can adjust to control fidelity to the reference
    shared_score_shift = np.log(2)  # higher value induces higher fidelity, set 0 for no shift
    shared_score_scale = 1.0  # higher value induces higher fidelity, set 1 for no rescale

    # for very famous images consider suppressing attention to the reference, here is a configuration example:
    # shared_score_shift = np.log(1)
    # shared_score_scale = 0.5

    for i in range(1, len(prompts)):
        prompts[i] = f'{prompts[i]}, {src_style}.'

    handler = sa_handler.Handler(pipeline)
    sa_args = sa_handler.StyleAlignedArgs(
        share_group_norm=True,
        share_layer_norm=True,
        share_attention=True,
        adain_queries=True,
        adain_keys=True,
        adain_values=False,
        shared_score_shift=shared_score_shift,
        shared_score_scale=shared_score_scale,
    )
    handler.register(sa_args)

    zT, inversion_callback = inversion.make_inversion_callback(zts, offset=5)

    # fixed seed for reproducible generations
    g_cpu = torch.Generator(device='cuda')
    g_cpu.manual_seed(10)

    latents = torch.randn(
        len(prompts), 4, 128, 128,
        device='cuda',
        generator=g_cpu,
        dtype=pipeline.unet.dtype,
    ).to('cuda')
    # replace the first latent with the inverted reference latent
    latents[0] = zT

    images_a = pipeline(
        prompts,
        latents=latents,
        callback_on_step_end=inversion_callback,
        num_inference_steps=num_inference_steps,
        guidance_scale=10.0,
    ).images
    handler.remove()
    # mediapy.show_images(images_a, titles=[p[:-(len(src_style) + 3)] for p in prompts])

    return images_a


css = """
#col-container{
    margin: 0 auto;
    max-width: 820px;
}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.HTML("""
            <h2 style="text-align: center;">Google's StyleAligned Transfer</h2>
        """)

        with gr.Row():
            with gr.Column():
                with gr.Group():
                    ref_path = gr.Image(type="filepath")
                    ref_style = gr.Textbox(label="Reference style")
                    ref_prompt = gr.Textbox(label="Reference prompt")
            with gr.Column():
                with gr.Group():
                    results = gr.Gallery()

        prompt1 = gr.Textbox(label="Prompt1")
        prompt2 = gr.Textbox(label="Prompt2")
        prompt3 = gr.Textbox(label="Prompt3")
        run_button = gr.Button("Submit")

        gr.Examples(
            examples=[
                [
                    "./example_image/medieval-bed.jpeg",
                    "medieval painting",
                    "Man laying on bed",
                    "A man working on a laptop",
                    "A man eating pizza",
                    "A woman playing on saxophone"
                ]
            ],
            inputs=[ref_path, ref_style, ref_prompt, prompt1, prompt2, prompt3]
        )

        run_button.click(
            fn=run,
            inputs=[ref_path, ref_style, ref_prompt, prompt1, prompt2, prompt3],
            outputs=[results]
        )

demo.queue().launch()