ritikmishra committed on
Commit
c1d151a
·
1 Parent(s): bc4f729

Upload 4 files

Browse files
app.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Image Captioning with ViT+GPT2
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1_dnSI55E0UX92QMvMj5_z8sgl2WVkixA
8
+ """
9
+
10
+ #! pip install transformers -q
11
+
12
+ #! pip install gradio -q
13
+
14
+ from PIL import Image
15
+ from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, PreTrainedTokenizerFast
16
+ import requests
17
+
18
# Captioning model, loaded from the "sachin/vit2distilgpt2" checkpoint.
model = VisionEncoderDecoderModel.from_pretrained("sachin/vit2distilgpt2")

# Turns an input image into the `pixel_values` tensor passed to `model.generate`.
vit_feature_extractor = ViTFeatureExtractor.from_pretrained(
    "google/vit-base-patch16-224-in21k"
)

# Decodes the generated token ids back into text.
tokenizer = PreTrainedTokenizerFast.from_pretrained("distilgpt2")
23
+
24
+ # url = 'https://d2gp644kobdlm6.cloudfront.net/wp-content/uploads/2016/06/bigstock-Shocked-and-surprised-boy-on-t-113798588-300x212.jpg'
25
+
26
+ # with Image.open(requests.get(url, stream=True).raw) as img:
27
+ # pixel_values = vit_feature_extractor(images=img, return_tensors="pt").pixel_values
28
+
29
+ #encoder_outputs = model.generate(pixel_values.to('cpu'),num_beams=5)
30
+
31
+ #generated_sentences = tokenizer.batch_decode(encoder_outputs, skip_special_tokens=True)
32
+
33
+ #generated_sentences
34
+
35
+ #naive text processing
36
+ #generated_sentences[0].split('.')[0]
37
+
38
+ # inference function
39
+
40
def vit2distilgpt2(img):
    """Generate a short caption for an image.

    Args:
        img: Image to caption; it is handed unchanged to
            ``vit_feature_extractor``. ``None`` is accepted and yields an
            empty caption (NOTE(review): Gradio is expected to pass ``None``
            when the form is submitted without an image -- confirm for the
            Gradio version in use).

    Returns:
        The first decoded sequence, truncated at its first ``.``; an empty
        string when ``img`` is ``None``.
    """
    # Nothing to caption: return an empty caption instead of failing inside
    # the feature extractor.
    if img is None:
        return ""

    pixel_values = vit_feature_extractor(images=img, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values.to("cpu"), num_beams=5)
    generated_sentences = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    # Naive text processing: keep only the text before the first period.
    return generated_sentences[0].split(".")[0]
46
+
47
+ #!wget https://media.glamour.com/photos/5f171c4fd35176eaedb36823/master/w_2560%2Cc_limit/bike.jpg
48
+
49
+ import gradio as gr
50
+
51
# One PIL image in, one caption textbox out.
inputs = [gr.inputs.Image(type="pil", label="Original Image")]
outputs = [gr.outputs.Textbox(label="Caption")]

# Text shown on the demo page.
title = "Image Captioning using ViT + GPT2"
description = "ViT and GPT2 are used to generate Image Caption for the uploaded image. COCO Dataset was used for training. This image captioning model might have some biases that we couldn't figure during our stress testing, so if you find any bias (gender, race and so on) please use `Flag` button to flag the image with bias"
article = " <a href='https://huggingface.co/sachin/vit2distilgpt2'>Model Repo on Hugging Face Model Hub</a>"

# Example images, referenced by file name (one row per example).
examples = [
    ["people-walking-street-pedestrian-crossing-traffic-light-city.jpeg"],
    ["elonmusk.jpeg"],
]

# Build the interface around the inference function, then start the app.
demo = gr.Interface(
    fn=vit2distilgpt2,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
    theme="huggingface",
)
demo.launch(debug=True, enable_queue=True)
78
+
elonmusk.jpeg ADDED
people-walking-street-pedestrian-crossing-traffic-light-city.jpeg ADDED
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ torch
2
+ transformers
3
+ Pillow