import gradio as gr
import torch
from PIL import Image
from diffusers import AutoPipelineForText2Image, DDIMScheduler
import numpy as np
import spaces  # Required for ZeroGPU Spaces

# Initialize the pipeline
pipeline = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
)

# Configure the scheduler for the pipeline
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)

# Load the IP-Adapter with the specified weights and set the scale for each component
pipeline.load_ip_adapter(
    "h94/IP-Adapter",
    subfolder="sdxl_models",
    weight_name=[
        "ip-adapter-plus_sdxl_vit-h.safetensors",
        "ip-adapter-plus-face_sdxl_vit-h.safetensors",
    ],
)
pipeline.set_ip_adapter_scale([0.7, 0.5])

# Style reference image for the first IP-Adapter.
# NOTE: "style.jpg" is a placeholder path; the original script referenced
# `style_image` without ever defining it, so point this at your own style image.
style_image = Image.open("style.jpg").convert("RGB")

# Define the desired size for the input face image
desired_size = (1024, 1024)


@spaces.GPU
def transform_image(face_image):
    # Move the pipeline to the GPU inside the function (ZeroGPU allocates it here)
    pipeline.to("cuda")
    generator = torch.Generator(device="cuda").manual_seed(0)

    # Normalize the Gradio input to a PIL image
    if isinstance(face_image, Image.Image):
        processed_face_image = face_image
    elif isinstance(face_image, np.ndarray):
        processed_face_image = Image.fromarray(face_image)
    else:
        raise ValueError("Unsupported image format")

    # Resize the face image
    processed_face_image = processed_face_image.resize(desired_size, Image.LANCZOS)

    # Run the pipeline; the IP-Adapter inputs are passed as PIL images, in the
    # same order as the loaded adapter weights (style first, face second)
    image = pipeline(
        prompt="soyjak",
        ip_adapter_image=[style_image, processed_face_image],
        negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
        num_inference_steps=30,
        generator=generator,
    ).images[0]

    # Move the pipeline back to the CPU after processing to release GPU resources
    pipeline.to("cpu")

    # The pipeline already returns a PIL image, so it can be handed straight to Gradio
    return image


# Gradio interface setup
demo = gr.Interface(
    fn=transform_image,
    inputs=gr.Image(label="Upload your face image"),
    outputs=gr.Image(label="Your Soyjak"),
    title="InstaSoyjak - turn anyone into a Soyjak",
    description="All you need to do is upload an image. Please use responsibly.",
)

demo.queue(max_size=20)
demo.launch()
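
# Usage sketch (assumptions): this script is laid out as the app.py of a
# Hugging Face Space using ZeroGPU (hence the `spaces` import and @spaces.GPU
# decorator). The dependency list below is inferred from the imports, not
# pinned by the original script:
#
#   pip install gradio torch diffusers transformers accelerate spaces
#   python app.py
#
# A style image (placeholder name "style.jpg" above) must sit next to the
# script for the first IP-Adapter input.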