# app.py for a Hugging Face Space: connecting Meta Llama 3.2 Vision, MaskFormer
# segmentation, and a Stable Diffusion model

import os

import gradio as gr
import spaces  # ZeroGPU helper: provides the @spaces.GPU decorator
import torch
from PIL import Image
from diffusers import StableDiffusionImg2ImgPipeline
from transformers import AutoProcessor, MllamaForConditionalGeneration, pipeline

# Hugging Face token for gated/private model access, read from the Space's secrets
hf_token = os.getenv("HF_TOKEN")

# Set up the Meta Llama 3.2 Vision model (gated; requires the token)
llama_vision_model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
vision_model = MllamaForConditionalGeneration.from_pretrained(
    llama_vision_model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=hf_token,  # 'token' replaces the deprecated 'use_auth_token'
)
processor = AutoProcessor.from_pretrained(llama_vision_model_id, token=hf_token)

# Set up the segmentation pipeline using MaskFormer (Swin-Large backbone).
# MaskFormer checkpoints are published per training dataset, so a dataset
# suffix is required; the bare "facebook/maskformer-swin-large" ID does not
# exist on the Hub. The ADE20K variant is used here.
segment_model_id = "facebook/maskformer-swin-large-ade"
segment_pipe = pipeline(
    "image-segmentation",
    model=segment_model_id,
    device=0,  # run on GPU
    token=hf_token,
)

# Set up Stable Diffusion v1.5 for image-to-image modification.
# The original "runwayml/stable-diffusion-v1-5" repo has been removed from the
# Hub; "stable-diffusion-v1-5/stable-diffusion-v1-5" is the current mirror.
# The img2img pipeline is used because the plain text-to-image
# StableDiffusionPipeline does not accept an `image` argument.
stable_diffusion_model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
diffusion_pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    stable_diffusion_model_id,
    torch_dtype=torch.float16,
    token=hf_token,
)
diffusion_pipe = diffusion_pipe.to("cuda")  # move to GPU


# GPU decorator: ZeroGPU allocates a GPU to this function for up to 120 seconds per call
@spaces.GPU(duration=120)
def process_image(image: Image.Image) -> Image.Image:
    # Step 1: use the vision model for initial image understanding (captioning)
    prompt = "<|image|><|begin_of_text|>Describe the image."
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(vision_model.device)
    output = vision_model.generate(**inputs, max_new_tokens=50)
    caption = processor.decode(output[0], skip_special_tokens=True)

    # Step 2: segment important parts of the image using MaskFormer.
    # Each entry in `segments` is a dict with "label", "score", and a PIL "mask".
    # (Computed here for future use; the diffusion step below does not yet
    # consume the masks.)
    segments = segment_pipe(image)

    # Step 3: modify the image with Stable Diffusion img2img, guided by the caption.
    # The image is resized to 512x512, the native SD v1.5 resolution.
    output_image = diffusion_pipe(
        prompt=f"Modify the {caption}",
        image=image.convert("RGB").resize((512, 512)),
        strength=0.75,
    ).images[0]

    return output_image


# Create the Gradio interface. live=True is deliberately omitted: re-running the
# full GPU pipeline on every input event would exhaust the ZeroGPU allocation.
interface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs="image",
    allow_flagging="never",  # disallow flagging to keep interactions light
    title="Image Processor: Vision, Segmentation, and Modification",
    description=(
        "Upload an image to generate a caption, segment important parts, "
        "and modify the image using Stable Diffusion."
    ),
)

# Launch the app
interface.launch()
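
# ---------------------------------------------------------------------------
# Optional sketch, for reference only (never called by the app, and placed
# after launch() so it does not affect the Space): one way to actually consume
# the segmentation masks from Step 2 instead of discarding them. This is an
# illustrative sketch, not part of the original pipeline; the inpainting
# checkpoint "stabilityai/stable-diffusion-2-inpainting" and the helper name
# `inpaint_first_segment` are assumptions chosen for the example.
from diffusers import StableDiffusionInpaintPipeline


def inpaint_first_segment(image: Image.Image, prompt: str) -> Image.Image:
    """Sketch: regenerate only the region covered by the first detected segment.

    The transformers image-segmentation pipeline returns a list of dicts, each
    carrying a PIL "mask" (white = segment area), which maps directly onto the
    `mask_image` argument of the inpainting pipeline.
    """
    inpaint_pipe = StableDiffusionInpaintPipeline.from_pretrained(
        "stabilityai/stable-diffusion-2-inpainting",  # assumed checkpoint
        torch_dtype=torch.float16,
        token=hf_token,
    ).to("cuda")
    segments = segment_pipe(image)
    mask = segments[0]["mask"]  # binary PIL mask of the first segment
    return inpaint_pipe(
        prompt=prompt,
        image=image.convert("RGB").resize((512, 512)),
        mask_image=mask.resize((512, 512)),
    ).images[0]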