VinitT committed on
Commit
fd19cdd
·
verified ·
1 Parent(s): e805a15

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -0
app.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from PIL import Image
import torch


@st.cache_resource
def load_model():
    """Load the Qwen2-VL processor and model once per server process.

    Streamlit re-executes this script top-to-bottom on every widget
    interaction; without caching, the multi-GB model would be
    re-instantiated on each rerun.
    """
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "Qwen/Qwen2-VL-2B-Instruct"
    )
    model.eval()  # inference only — disable dropout/training behavior
    return processor, model


processor, model = load_model()

# Streamlit app
st.title("Image Description Generator")

uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    # Open the image; convert to RGB so RGBA/paletted PNGs don't break the
    # vision processor, which expects 3-channel input.
    image = Image.open(uploaded_file).convert("RGB")
    # use_container_width replaces the deprecated use_column_width argument.
    st.image(image, caption='Uploaded Image.', use_container_width=True)
    st.write("Generating description...")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]

    # Build the chat-formatted prompt string (with the image placeholder token).
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Tokenize the prompt and preprocess the image together.
    inputs = processor(
        text=[text],
        images=[image],
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cpu")

    # Inference: no_grad avoids building autograd graphs (saves memory/time).
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Strip the prompt tokens so only the newly generated text is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    st.write("Description:")
    st.write(output_text[0])