adityas2410 committed on
Commit
73d7797
1 Parent(s): be576d1

Upload 5 files

app.py ADDED
@@ -0,0 +1,48 @@
+ import gradio as gr
+ from tasks.image_caption import image_captioning
+ from tasks.image_retrieval import image_retrieval
+ from tasks.visual_qa import visual_qa
+
+ caption_interface = gr.Interface(
+     fn=image_captioning,
+     inputs=gr.Image(type="pil", label="Upload Image"),
+     outputs=gr.Textbox(label="Generated Caption"),
+     title="Image Captioning",
+     description="Generate a caption for the uploaded image.",
+     allow_flagging="never"
+ )
+
+ retrieval_interface = gr.Interface(
+     fn=image_retrieval,
+     inputs=[
+         gr.Textbox(label="Image URL"),
+         gr.Textbox(label="Description Text")
+     ],
+     outputs=[
+         gr.Image(label="Retrieved Image"),
+         gr.Textbox(label="Matching Probability")
+     ],
+     title="Image Retrieval",
+     description="Check if the image and text match semantically.",
+     allow_flagging="never"
+ )
+
+ vqa_interface = gr.Interface(
+     fn=visual_qa,
+     inputs=[
+         gr.Image(type="pil", label="Upload Image"),
+         gr.Textbox(label="Question")
+     ],
+     outputs=gr.Textbox(label="Answer"),
+     title="Visual Question Answering",
+     description="Answer questions about the uploaded image.",
+     allow_flagging="never"
+ )
+
+ # Combine vision-language tasks into a tabbed interface
+ app = gr.TabbedInterface(
+     interface_list=[caption_interface, retrieval_interface, vqa_interface],
+     tab_names=["Image Captioning", "Image Retrieval", "Visual Q&A"]
+ )
+
+ app.launch()
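Note: app.py wires the three task functions into per-task gr.Interface objects and combines them with gr.TabbedInterface; the bare app.launch() call is what starts the server on a Space. For running the file locally, one might guard the launch instead. A minimal sketch, where the host and port arguments are illustrative assumptions and not part of the upload:

# Sketch only: guard the launch for local runs; 7860 is Gradio's default port.
if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)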
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ transformers
+ gradio
+ torch
+ requests
+ pillow
tasks/image_caption.py ADDED
@@ -0,0 +1,10 @@
+ from transformers import AutoProcessor, BlipForConditionalGeneration
+
+ caption_id = "Salesforce/blip-image-captioning-base"
+ caption_model = BlipForConditionalGeneration.from_pretrained(caption_id)
+ caption_processor = AutoProcessor.from_pretrained(caption_id)
+
+ def image_captioning(image):
+     inputs = caption_processor(image, "a photograph of", return_tensors="pt")
+     out = caption_model.generate(**inputs)
+     return caption_processor.decode(out[0], skip_special_tokens=True)
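Note: image_caption.py loads the BLIP captioning checkpoint once at import time and conditions generation on the prefix "a photograph of". A minimal sketch of calling it outside the Gradio UI, assuming a local test image (the path is a placeholder):

# Sketch only: direct call to image_captioning; "test.jpg" is a placeholder path.
from PIL import Image
from tasks.image_caption import image_captioning

image = Image.open("test.jpg").convert("RGB")
print(image_captioning(image))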
tasks/image_retrieval.py ADDED
@@ -0,0 +1,21 @@
+ from transformers import AutoProcessor, BlipForImageTextRetrieval
+ from PIL import Image
+ import requests
+ import torch
+
+ retrieval_id = "Salesforce/blip-itm-base-coco"
+ retrieval_model = BlipForImageTextRetrieval.from_pretrained(retrieval_id)
+ retrieval_processor = AutoProcessor.from_pretrained(retrieval_id)
+
+ def image_retrieval(image_url, text):
+     try:
+         raw_image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
+         inputs = retrieval_processor(images=raw_image, text=text, return_tensors="pt")
+         itm_scores = retrieval_model(**inputs)[0]
+         itm_score = torch.nn.functional.softmax(itm_scores, dim=1)
+         probability = itm_score[0][1].item()
+
+         return raw_image, f"The image and text are matched with a probability of {probability:.4f}"
+     except Exception as e:
+         return None, f"Error: {str(e)}"
+
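Note: image_retrieval.py runs BLIP's image-text matching (ITM) head, which produces two logits (no match, match); the softmax value at index 1 is what the function reports as the match probability. A minimal sketch of calling it directly, with a placeholder image URL:

# Sketch only: the URL is a placeholder; the function downloads the image itself.
from tasks.image_retrieval import image_retrieval

image, message = image_retrieval(
    "https://example.com/cat.jpg",
    "a cat sitting on a sofa",
)
print(message)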
tasks/visual_qa.py ADDED
@@ -0,0 +1,10 @@
+ from transformers import AutoProcessor, BlipForQuestionAnswering
+
+ vqa_id = "Salesforce/blip-vqa-base"
+ vqa_model = BlipForQuestionAnswering.from_pretrained(vqa_id)
+ vqa_processor = AutoProcessor.from_pretrained(vqa_id)
+
+ def visual_qa(image, question):
+     inputs = vqa_processor(image, question, return_tensors="pt")
+     out = vqa_model.generate(**inputs)
+     return vqa_processor.decode(out[0], skip_special_tokens=True)
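Note: visual_qa.py follows the same pattern with the BLIP VQA checkpoint. A minimal sketch of querying it without the front end, again assuming a placeholder local test image:

# Sketch only: "test.jpg" is a placeholder path.
from PIL import Image
from tasks.visual_qa import visual_qa

image = Image.open("test.jpg").convert("RGB")
print(visual_qa(image, "How many people are in the picture?"))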