kirill committed
Commit e02f821 · 1 Parent(s): ee2df8c

Added Image Captioning and Visual Q&A

Files changed (4)
  1. app.py +11 -68
  2. image_captioning.py +61 -0
  3. visual_qa.py +58 -0
  4. zero_shot_classification.py +68 -0
app.py CHANGED
@@ -1,75 +1,18 @@
-from transformers import CLIPModel, CLIPProcessor
-from PIL import Image
-import time
 import gradio as gr
+from image_captioning import get_image_captioning_tab
+from visual_qa import get_visual_qa_tab
+from zero_shot_classification import get_zero_shot_classification_tab
-
-
-openai_model_name = "openai/clip-vit-large-patch14"
-openai_model = CLIPModel.from_pretrained(openai_model_name)
-openai_processor = CLIPProcessor.from_pretrained(openai_model_name)
-
-patrickjohncyh_model_name = "patrickjohncyh/fashion-clip"
-patrickjohncyh_model = CLIPModel.from_pretrained(patrickjohncyh_model_name)
-patrickjohncyh_processor = CLIPProcessor.from_pretrained(patrickjohncyh_model_name)
-
-model_map = {
-    openai_model_name: (openai_model, openai_processor),
-    patrickjohncyh_model_name: (patrickjohncyh_model, patrickjohncyh_processor)
-}
-
-
-def gradio_process(model_name, image, text):
-    (model, processor) = model_map[model_name]
-    labels = text.split(", ")
-    print (labels)
-    start = time.time()
-    inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
-    outputs = model(**inputs)
-    probs = outputs.logits_per_image.softmax(dim=1)[0]
-    end = time.time()
-    time_spent = end - start
-    probs = list(probs)
-    results = []
-    for i in range(len(labels)):
-        results.append(f"{labels[i]} - {probs[i].item():.4f}")
-    result = "\n".join(results)
-
-    return [result, time_spent]
-
-
-with gr.Blocks() as zero_shot_image_classification_tab:
-    gr.Markdown("# Zero-Shot Image Classification")
-
-    with gr.Row():
-        with gr.Column():
-            # Input components
-            input_image = gr.Image(label="Upload Image", type="pil")
-            input_text = gr.Textbox(label="Labels (comma separated)")
-            model_selector = gr.Dropdown([openai_model_name, patrickjohncyh_model_name],
-                                         label = "Select Model")
-
-            # Process button
-            process_btn = gr.Button("Classificate")
-
-        with gr.Column():
-            # Output components
-            elapsed_result = gr.Textbox(label="Seconds elapsed", lines=1)
-            output_text = gr.Textbox(label="Classification")
-
-    # Connect the input components to the processing function
-    process_btn.click(
-        fn=gradio_process,
-        inputs=[
-            model_selector,
-            input_image,
-            input_text
-        ],
-        outputs=[output_text, elapsed_result]
-    )
 
 
 with gr.Blocks() as app:
-    gr.TabbedInterface([zero_shot_image_classification_tab], ["Zero-Shot Classification"])
+    image_captioning_tab = get_image_captioning_tab()
+    visual_qa_tab = get_visual_qa_tab()
+    zero_shot_classification_tab = get_zero_shot_classification_tab()
+
+    gr.TabbedInterface(
+        [image_captioning_tab, visual_qa_tab, zero_shot_classification_tab],
+        ["Image Captioning", "Visual Q&A", "Zero-Shot Classification"]
+    )
 
 
 app.launch()
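Each extracted module now exposes a get_*_tab() factory that builds and returns a gr.Blocks block, and app.py only composes the three tabs with gr.TabbedInterface. As a minimal sketch (not part of this commit), a single tab can also be launched on its own while developing, since a gr.Blocks object is directly launchable:

from image_captioning import get_image_captioning_tab

# Build only the captioning tab and serve it standalone for quick iteration
# (sketch only; the committed app.py wraps all three tabs in gr.TabbedInterface).
captioning_demo = get_image_captioning_tab()
captioning_demo.launch()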
image_captioning.py ADDED
@@ -0,0 +1,61 @@
+from transformers import BlipForConditionalGeneration, BlipProcessor
+import time
+import gradio as gr
+
+
+def get_image_captioning_tab():
+    salesforce_model_name = "Salesforce/blip-image-captioning-base"
+    salesforce_model = BlipForConditionalGeneration.from_pretrained(salesforce_model_name)
+    salesforce_processor = BlipProcessor.from_pretrained(salesforce_model_name)
+
+    noamrot_model_name = "noamrot/FuseCap_Image_Captioning"
+    noamrot_model = BlipForConditionalGeneration.from_pretrained(noamrot_model_name)
+    noamrot_processor = BlipProcessor.from_pretrained(noamrot_model_name)
+
+    model_map = {
+        salesforce_model_name: (salesforce_model, salesforce_processor),
+        noamrot_model_name: (noamrot_model, noamrot_processor)
+    }
+
+    def gradio_process(model_name, image, text):
+        (model, processor) = model_map[model_name]
+        start = time.time()
+        inputs = processor(image, text, return_tensors="pt")
+        out = model.generate(**inputs)
+        result = processor.decode(out[0], skip_special_tokens=True)
+        end = time.time()
+        time_spent = end - start
+
+        return [result, time_spent]
+
+    with gr.Blocks() as image_captioning_tab:
+        gr.Markdown("# Image Captioning")
+
+        with gr.Row():
+            with gr.Column():
+                # Input components
+                input_image = gr.Image(label="Upload Image", type="pil")
+                input_text = gr.Textbox(label="Caption")
+                model_selector = gr.Dropdown([salesforce_model_name, noamrot_model_name],
+                                             label = "Select Model")
+
+                # Process button
+                process_btn = gr.Button("Generate caption")
+
+            with gr.Column():
+                # Output components
+                elapsed_result = gr.Textbox(label="Seconds elapsed", lines=1)
+                output_text = gr.Textbox(label="Generated caption")
+
+        # Connect the input components to the processing function
+        process_btn.click(
+            fn=gradio_process,
+            inputs=[
+                model_selector,
+                input_image,
+                input_text
+            ],
+            outputs=[output_text, elapsed_result]
+        )
+
+    return image_captioning_tab
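For reference, the captioning helper passes the textbox contents to BLIP as a prompt prefix (conditional captioning). A minimal standalone sketch of the same call outside Gradio, assuming a hypothetical local file example.jpg and a made-up prefix:

from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor

model_name = "Salesforce/blip-image-captioning-base"
model = BlipForConditionalGeneration.from_pretrained(model_name)
processor = BlipProcessor.from_pretrained(model_name)

image = Image.open("example.jpg").convert("RGB")               # hypothetical input image
inputs = processor(image, "a photo of", return_tensors="pt")   # text argument is the caption prefix
out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))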
visual_qa.py ADDED
@@ -0,0 +1,58 @@
+from transformers import pipeline
+import time
+import gradio as gr
+
+
+def get_visual_qa_tab():
+    salesforce_model_name = "Salesforce/blip-vqa-base"
+    salesforce_pipe = pipeline("visual-question-answering", model=salesforce_model_name)
+
+    dandelin_model_name = "dandelin/vilt-b32-finetuned-vqa"
+    dandelin_pipe = pipeline("visual-question-answering", model=dandelin_model_name)
+
+    pipe_map = {
+        salesforce_model_name: salesforce_pipe,
+        dandelin_model_name: dandelin_pipe
+    }
+
+    def gradio_process(model_name, image, text):
+        pipe = pipe_map[model_name]
+        start = time.time()
+        output = pipe(image, text)
+        end = time.time()
+        time_spent = end - start
+        result = output[0]['answer']
+
+        return [result, time_spent]
+
+    with gr.Blocks() as visual_qa_tab:
+        gr.Markdown("# Visual Question & Answering")
+
+        with gr.Row():
+            with gr.Column():
+                # Input components
+                input_image = gr.Image(label="Upload Image", type="pil")
+                input_text = gr.Textbox(label="Question")
+                model_selector = gr.Dropdown([salesforce_model_name, dandelin_model_name],
+                                             label = "Select Model")
+
+                # Process button
+                process_btn = gr.Button("Generate answer")
+
+            with gr.Column():
+                # Output components
+                elapsed_result = gr.Textbox(label="Seconds elapsed", lines=1)
+                output_text = gr.Textbox(label="Answer")
+
+        # Connect the input components to the processing function
+        process_btn.click(
+            fn=gradio_process,
+            inputs=[
+                model_selector,
+                input_image,
+                input_text
+            ],
+            outputs=[output_text, elapsed_result]
+        )
+
+    return visual_qa_tab
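For reference, the visual-question-answering pipeline used above returns a list of candidate answers (for the ViLT checkpoint each entry also carries a score), and the tab reads the top answer from output[0]['answer']. A minimal standalone sketch, assuming a hypothetical local file example.jpg and a made-up question:

from transformers import pipeline

# Same task and checkpoint as one of the tab's options; the pipeline also accepts
# a PIL image or an image URL in place of a file path.
vqa = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
outputs = vqa(image="example.jpg", question="What is shown in the picture?")
print(outputs[0]["answer"])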
zero_shot_classification.py ADDED
@@ -0,0 +1,68 @@
+from transformers import CLIPModel, CLIPProcessor
+import time
+import gradio as gr
+
+
+def get_zero_shot_classification_tab():
+    openai_model_name = "openai/clip-vit-large-patch14"
+    openai_model = CLIPModel.from_pretrained(openai_model_name)
+    openai_processor = CLIPProcessor.from_pretrained(openai_model_name)
+
+    patrickjohncyh_model_name = "patrickjohncyh/fashion-clip"
+    patrickjohncyh_model = CLIPModel.from_pretrained(patrickjohncyh_model_name)
+    patrickjohncyh_processor = CLIPProcessor.from_pretrained(patrickjohncyh_model_name)
+
+    model_map = {
+        openai_model_name: (openai_model, openai_processor),
+        patrickjohncyh_model_name: (patrickjohncyh_model, patrickjohncyh_processor)
+    }
+
+    def gradio_process(model_name, image, text):
+        (model, processor) = model_map[model_name]
+        labels = text.split(", ")
+        print(labels)
+        start = time.time()
+        inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
+        outputs = model(**inputs)
+        probs = outputs.logits_per_image.softmax(dim=1)[0]
+        end = time.time()
+        time_spent = end - start
+        probs = list(probs)
+        results = []
+        for i in range(len(labels)):
+            results.append(f"{labels[i]} - {probs[i].item():.4f}")
+        result = "\n".join(results)
+
+        return [result, time_spent]
+
+    with gr.Blocks() as zero_shot_image_classification_tab:
+        gr.Markdown("# Zero-Shot Image Classification")
+
+        with gr.Row():
+            with gr.Column():
+                # Input components
+                input_image = gr.Image(label="Upload Image", type="pil")
+                input_text = gr.Textbox(label="Labels (comma separated)")
+                model_selector = gr.Dropdown([openai_model_name, patrickjohncyh_model_name],
+                                             label = "Select Model")
+
+                # Process button
+                process_btn = gr.Button("Classify")
+
+            with gr.Column():
+                # Output components
+                elapsed_result = gr.Textbox(label="Seconds elapsed", lines=1)
+                output_text = gr.Textbox(label="Classification")
+
+        # Connect the input components to the processing function
+        process_btn.click(
+            fn=gradio_process,
+            inputs=[
+                model_selector,
+                input_image,
+                input_text
+            ],
+            outputs=[output_text, elapsed_result]
+        )
+
+    return zero_shot_image_classification_tab
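For reference, gradio_process above follows the standard CLIP zero-shot recipe: logits_per_image scores the image against every candidate label and softmax turns the scores into probabilities. A minimal standalone sketch of the same computation, assuming a hypothetical local file example.jpg and a made-up label set:

from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model_name = "openai/clip-vit-large-patch14"
model = CLIPModel.from_pretrained(model_name)
processor = CLIPProcessor.from_pretrained(model_name)

labels = ["a dog", "a cat", "a car"]   # hypothetical candidate labels
image = Image.open("example.jpg")      # hypothetical input image

inputs = processor(text=labels, images=image, return_tensors="pt", padding=True)
probs = model(**inputs).logits_per_image.softmax(dim=1)[0]
for label, prob in zip(labels, probs):
    print(f"{label} - {prob.item():.4f}")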