Files changed (1)
  1. app.py +56 -32
app.py CHANGED
@@ -1,9 +1,8 @@
 import os
 import streamlit as st
 from huggingface_hub import login
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import MllamaForConditionalGeneration, AutoProcessor
 from PIL import Image
-import requests
 import torch
 
 # Step 1: Log in to Hugging Face with your access token from secrets
@@ -13,57 +12,82 @@ if huggingface_token:
 else:
     st.error("Hugging Face token not found. Please set it in the Secrets section.")
 
-# Step 2: Load the model and tokenizer
-model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" # Adjust if needed
+# Step 2: Load the model and processor
 try:
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(model_name)
-    st.success("Model loaded successfully!")
+    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+    model = MllamaForConditionalGeneration.from_pretrained(
+        model_name,
+        use_auth_token=huggingface_token,
+        torch_dtype=torch.bfloat16,
+        device_map="auto",
+    )
+    processor = AutoProcessor.from_pretrained(
+        model_name,
+        use_auth_token=huggingface_token,
+    )
+    st.success("Model and processor loaded successfully!")
 except Exception as e:
-    st.error(f"Error loading model: {str(e)}")
+    st.error(f"Error loading model or processor: {str(e)}")
 
 # Step 3: Create a simple Streamlit app
 def main():
     st.title("Llama 3.2 11B Vision Model")
     st.write("Upload an image and enter a prompt to generate output.")
-
+
     # Upload image
    image_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
     prompt = st.text_area("Enter your prompt here:")
-
+
     if st.button("Generate Output"):
         if image_file and prompt:
             # Load image
-            image = Image.open(image_file)
+            image = Image.open(image_file).convert("RGB")
             st.image(image, caption="Uploaded Image", use_column_width=True)
-
-            # Preprocess the image if needed (convert to tensor, etc.)
-            # This depends on how the model expects the image input
-
-            # Example of converting image to a format suitable for the model
-            # Note: Adjust this part based on your model's requirements.
-            # Here, we're just using a placeholder for the model input.
-            # You might need to resize or normalize the image based on the model's requirements.
-            # For example:
-            # image_tensor = preprocess_image(image)
 
             try:
-                # Prepare the input for the model
-                inputs = tokenizer(prompt, return_tensors='pt')
-
-                # Perform inference
-                # Adjust the input format for the model accordingly
-                # Here we assume the model takes a prompt and an image (adjust as necessary)
+                # Prepare the messages in the format expected by the processor
+                messages = [
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": prompt},
+                            {"type": "image"}
+                        ]
+                    }
+                ]
+
+                # Apply chat template
+                input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
+
+                # Prepare inputs for the model
+                inputs = processor(
+                    text=input_text,
+                    images=[image],
+                    return_tensors="pt"
+                ).to("cuda" if torch.cuda.is_available() else "cpu")
+
+                # Generate output
                 with torch.no_grad():
-                    model_output = model.generate(**inputs) # Pass image tensor if required
-
+                    output_ids = model.generate(
+                        **inputs,
+                        max_new_tokens=250,
+                    )
+
                 # Decode the output
-                output_text = tokenizer.decode(model_output[0], skip_special_tokens=True)
-                st.write("Generated Output:", output_text)
+                output_text = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
+
+                # Extract the generated response
+                # Remove the prompt part from the output_text
+                if input_text in output_text:
+                    generated_output = output_text.replace(input_text, "").strip()
+                else:
+                    generated_output = output_text.strip()
+
+                st.write("Generated Output:", generated_output)
             except Exception as e:
                 st.error(f"Error during prediction: {str(e)}")
         else:
             st.warning("Please upload an image and enter a prompt.")
 
 if __name__ == "__main__":
-    main()
+    main()
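
For reference, a minimal standalone sketch of the inference flow the updated app.py wires into Streamlit: load MllamaForConditionalGeneration with its AutoProcessor, build a chat-template prompt containing an image slot, then generate and decode. The local image path "photo.jpg", the example prompt, reading the token from an HF_TOKEN environment variable, and passing it via `token=` (rather than the diff's `use_auth_token=`) are illustrative assumptions, not part of the change.

```python
# Sketch of the Llama 3.2 Vision flow used in app.py, outside Streamlit.
# Assumptions: HF_TOKEN is set in the environment and "photo.jpg" exists locally.
import os

import torch
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration

model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
token = os.environ.get("HF_TOKEN")

# Load the vision-language model and its processor (tokenizer + image preprocessor).
model = MllamaForConditionalGeneration.from_pretrained(
    model_name, token=token, torch_dtype=torch.bfloat16, device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_name, token=token)

image = Image.open("photo.jpg").convert("RGB")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Render the chat template to a prompt string, then batch text and image together.
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=input_text, images=[image], return_tensors="pt").to(model.device)

with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=250)

print(processor.batch_decode(output_ids, skip_special_tokens=True)[0])
```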