# app.py for a Hugging Face Space: connecting Meta Llama 3.2 Vision, MaskFormer
# segmentation, and a Stable Diffusion model

import os

import gradio as gr
import spaces  # ZeroGPU helper: provides the @spaces.GPU decorator
import torch
from PIL import Image
from diffusers import StableDiffusionImg2ImgPipeline
from transformers import AutoProcessor, MllamaForConditionalGeneration, pipeline

# Hugging Face token for gated/private model access, read from the Space's secrets
hf_token = os.getenv("HF_TOKEN")

# Set up the Meta Llama 3.2 Vision model (gated; requires the token)
llama_vision_model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
vision_model = MllamaForConditionalGeneration.from_pretrained(
    llama_vision_model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=hf_token,  # 'token' replaces the deprecated 'use_auth_token'
)
processor = AutoProcessor.from_pretrained(llama_vision_model_id, token=hf_token)

# Set up the segmentation pipeline using MaskFormer (Swin-Large backbone).
# MaskFormer checkpoints are published per training dataset, so a dataset
# suffix is required; the bare "facebook/maskformer-swin-large" ID does not
# exist on the Hub. The ADE20K variant is used here.
segment_model_id = "facebook/maskformer-swin-large-ade"
segment_pipe = pipeline(
    "image-segmentation",
    model=segment_model_id,
    device=0,  # run on GPU
    token=hf_token,
)

# Set up Stable Diffusion v1.5 for image-to-image modification.
# The original "runwayml/stable-diffusion-v1-5" repo has been removed from the
# Hub; "stable-diffusion-v1-5/stable-diffusion-v1-5" is the current mirror.
# The img2img pipeline is used because the plain text-to-image
# StableDiffusionPipeline does not accept an `image` argument.
stable_diffusion_model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
diffusion_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    stable_diffusion_model_id,
    torch_dtype=torch.float16,
    token=hf_token,
)
diffusion_pipe = diffusion_pipe.to("cuda")  # move to GPU


# GPU decorator: ZeroGPU allocates a GPU to this function for up to 120 seconds per call
@spaces.GPU(duration=120)
def process_image(image: Image.Image) -> Image.Image:
    # Step 1: use the vision model for initial image understanding (captioning)
    prompt = "<|image|><|begin_of_text|>Describe the image."
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(vision_model.device)
    output = vision_model.generate(**inputs, max_new_tokens=50)
    caption = processor.decode(output[0], skip_special_tokens=True)

    # Step 2: segment important parts of the image using MaskFormer.
    # Each entry in `segments` is a dict with "label", "score", and a PIL "mask".
    # (Computed here for future use; the diffusion step below does not yet
    # consume the masks.)
    segments = segment_pipe(image)

    # Step 3: modify the image with Stable Diffusion img2img, guided by the caption.
    # The image is resized to 512x512, the native SD v1.5 resolution.
    output_image = diffusion_pipe(
        prompt=f"Modify the {caption}",
        image=image.convert("RGB").resize((512, 512)),
        strength=0.75,
    ).images[0]

    return output_image


# Create the Gradio interface. live=True is deliberately omitted: re-running the
# full GPU pipeline on every input event would exhaust the ZeroGPU allocation.
interface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs="image",
    allow_flagging="never",  # disallow flagging to keep interactions light
    title="Image Processor: Vision, Segmentation, and Modification",
    description=(
        "Upload an image to generate a caption, segment important parts, "
        "and modify the image using Stable Diffusion."
    ),
)

# Launch the app
interface.launch()
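
# ---------------------------------------------------------------------------
# Optional sketch, for reference only (never called by the app, and placed
# after launch() so it does not affect the Space): one way to actually consume
# the segmentation masks from Step 2 instead of discarding them. This is an
# illustrative sketch, not part of the original pipeline; the inpainting
# checkpoint "stabilityai/stable-diffusion-2-inpainting" and the helper name
# `inpaint_first_segment` are assumptions chosen for the example.
from diffusers import StableDiffusionInpaintPipeline


def inpaint_first_segment(image: Image.Image, prompt: str) -> Image.Image:
    """Sketch: regenerate only the region covered by the first detected segment.

    The transformers image-segmentation pipeline returns a list of dicts, each
    carrying a PIL "mask" (white = segment area), which maps directly onto the
    `mask_image` argument of the inpainting pipeline.
    """
    inpaint_pipe = StableDiffusionInpaintPipeline.from_pretrained(
        "stabilityai/stable-diffusion-2-inpainting",  # assumed checkpoint
        torch_dtype=torch.float16,
        token=hf_token,
    ).to("cuda")
    segments = segment_pipe(image)
    mask = segments[0]["mask"]  # binary PIL mask of the first segment
    return inpaint_pipe(
        prompt=prompt,
        image=image.convert("RGB").resize((512, 512)),
        mask_image=mask.resize((512, 512)),
    ).images[0]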