Ryukijano committed on
Commit
c0583a3
1 Parent(s): 6b90443

Update app.py

Files changed (1)
  1. app.py +12 -13
app.py CHANGED
@@ -1,7 +1,7 @@
- # app.py for Hugging Face Space: Connecting Meta Llama 3.2 Vision, PaliGemma Segmentation, and Diffusion Model
+ # app.py for Hugging Face Space: Connecting Meta Llama 3.2 Vision, Efficient Segmentation, and Diffusion Model
import gradio as gr
import spaces # Import the spaces module to use GPU-specific decorators
- from transformers import PaliGemmaForConditionalGeneration, AutoProcessor, pipeline
+ from transformers import VisionEncoderDecoderModel, AutoFeatureExtractor, pipeline
from diffusers import StableDiffusionPipeline
import torch
import os
@@ -10,18 +10,18 @@ from PIL import Image
# Set up Hugging Face token for private model access
hf_token = os.getenv("HF_TOKEN") # Fetch token from repository secrets

- # Set up Meta Llama 3.2 Vision model (using private model with token)
- llama_vision_model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
- vision_model = PaliGemmaForConditionalGeneration.from_pretrained(
+ # Set up Meta Llama 3.2 Vision model (using Vision Encoder-Decoder model with token)
+ llama_vision_model_id = "nlpconnect/vit-gpt2-image-captioning"
+ vision_model = VisionEncoderDecoderModel.from_pretrained(
    llama_vision_model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=hf_token # Updated to use 'token' instead of 'use_auth_token'
)
- processor = AutoProcessor.from_pretrained(llama_vision_model_id, token=hf_token)
+ feature_extractor = AutoFeatureExtractor.from_pretrained(llama_vision_model_id, token=hf_token)

- # Set up segmentation model using PaliGemma from Hugging Face Hub
- segment_model_id = "google/paligemma-3b-mix-224"
+ # Set up segmentation model using an efficient publicly available model
+ segment_model_id = "facebook/detr-resnet-50"
segment_pipe = pipeline(
    "image-segmentation",
    model=segment_model_id,
@@ -40,12 +40,11 @@ diffusion_pipe = diffusion_pipe.to("cuda") # Force usage of GPU
@spaces.GPU(duration=120) # Allocates GPU for a maximum of 120 seconds
def process_image(image):
    # Step 1: Use Vision model for initial image understanding (captioning)
-     prompt = "<|image|><|begin_of_text|>Describe the image."
-     inputs = processor(image, prompt, return_tensors="pt").to(vision_model.device)
-     output = vision_model.generate(**inputs, max_new_tokens=50)
-     caption = processor.decode(output[0], skip_special_tokens=True)
+     pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(vision_model.device)
+     output_ids = vision_model.generate(pixel_values, max_length=50)
+     caption = vision_model.config.decoder.tokenizer.decode(output_ids[0], skip_special_tokens=True)

-     # Step 2: Segment important parts of the image using PaliGemma
+     # Step 2: Segment important parts of the image using DETR
    segmented_result = segment_pipe(image=image)
    segments = segmented_result
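For reference, decoding on this checkpoint normally goes through a separately loaded tokenizer: a transformers model config does not carry a tokenizer object, so the vision_model.config.decoder.tokenizer access in the added lines is not an attribute the library provides. Below is a minimal sketch of the captioning step under that assumption, reusing the nlpconnect/vit-gpt2-image-captioning id from the commit; caption_image is an illustrative helper name, not something from the Space.

# Sketch: caption an image with nlpconnect/vit-gpt2-image-captioning.
# The tokenizer is loaded explicitly; model configs do not expose one.
from PIL import Image
from transformers import AutoFeatureExtractor, AutoTokenizer, VisionEncoderDecoderModel

model_id = "nlpconnect/vit-gpt2-image-captioning"
vision_model = VisionEncoderDecoderModel.from_pretrained(model_id)
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

def caption_image(image: Image.Image) -> str:
    # Preprocess to pixel values, generate token ids, then decode to text.
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
    output_ids = vision_model.generate(pixel_values.to(vision_model.device), max_length=50)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)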
 
 
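On the segmentation side, facebook/detr-resnet-50 is the object-detection checkpoint; the image-segmentation pipeline task is usually paired with a checkpoint trained for segmentation, such as the panoptic DETR variant. A minimal sketch under that assumption follows; the panoptic model id is not taken from this commit.

# Sketch: image segmentation with the transformers pipeline.
# facebook/detr-resnet-50-panoptic is an assumed substitute for the detection-only checkpoint.
from transformers import pipeline

segment_pipe = pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic")

def segment_image(image):
    # Each entry in the result is a dict with "label", "score", and a PIL "mask".
    return segment_pipe(image)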
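Finally, a minimal sketch of how the diffusion pipeline and the @spaces.GPU decorator from the hunk above typically fit together on a Space: the pipeline is built once at module level and generation happens inside the decorated function. The Stable Diffusion checkpoint id here is an assumption, since the commit does not show which model the Space loads.

# Sketch: GPU-decorated generation with a diffusers pipeline.
# The checkpoint id below is assumed; the commit does not reveal the one actually used.
import spaces
import torch
from diffusers import StableDiffusionPipeline

diffusion_pipe = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
)
diffusion_pipe = diffusion_pipe.to("cuda")  # Mirrors the hunk header: force usage of GPU

@spaces.GPU(duration=120)  # GPU is allocated only while this function runs, up to 120 seconds
def generate_image(prompt: str):
    return diffusion_pipe(prompt).images[0]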