Shriharshan committed on
Commit
0273271
1 Parent(s): efb9c5f

Update app.py

Files changed (1)
  1. app.py +13 -21
app.py CHANGED
@@ -1,46 +1,38 @@
-# Image captioning with ViT+GPT2
 from PIL import Image
 from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, PreTrainedTokenizerFast
 import requests
+
 model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
-vit_feature_extactor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
+vit_feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
 tokenizer = PreTrainedTokenizerFast.from_pretrained("distilgpt2")

-
-#url = 'https://d2gp644kobdlm6.cloudfront.net/wp-content/uploads/2016/06/bigstock-Shocked-and-surprised-boy-on-t-113798588-300x212.jpg'
-# with Image.open(requests.get(url, stream=True).raw) as img:
-#     pixel_values = vit_feature_extactor(images=img, return_tensors="pt").pixel_values
-#     encoder_outputs = model.generate(pixel_values.to('cpu'),num_beams = 5)
-#     generated_senetences = tokenizer.batch_decode(encoder_outputs, skip_special_tokens=True,)
-# generated_senetences
-# generated_senetences[0].split(".")[0]
-
-
 def vit2distilgpt2(img):
-    pixel_values = vit_feature_extactor(images=img, return_tensors="pt").pixel_values
-    encoder_outputs = generated_ids = model.generate(pixel_values.to('cpu'),num_beams=5)
-    generated_senetences = tokenizer.batch_decode(encoder_outputs, skip_special_tokens=True)
+    pixel_values = vit_feature_extractor(images=img, return_tensors="pt").pixel_values
+    encoder_outputs = model.generate(pixel_values.to('cpu'), num_beams=5, num_return_sequences=3)
+    generated_sentences = tokenizer.batch_decode(encoder_outputs, skip_special_tokens=True)

-    return(generated_senetences[0].split('.')[0])
+    return generated_sentences

 import gradio as gr
+
 inputs = [
-    gr.inputs.Image(type="pil",label="Original Images")
+    gr.inputs.Image(type="pil", label="Original Images")
 ]

 outputs = [
-    gr.outputs.Textbox(label = "Caption")
+    gr.outputs.Textbox(label="Caption 1"),
+    gr.outputs.Textbox(label="Caption 2"),
+    gr.outputs.Textbox(label="Caption 3")
 ]

 title = "Image Captioning using ViT + GPT2"
-description = "ViT and GPT2 are used to generate Image Caption for the uploaded image.COCO DataSet is used for Training"
+description = "ViT and GPT2 are used to generate Image Caption for the uploaded image. COCO DataSet is used for Training"
 examples = [
     ["Image1.png"],
     ["Image2.png"],
     ["Image3.png"]
 ]

-
 gr.Interface(
     vit2distilgpt2,
     inputs,
@@ -49,4 +41,4 @@ gr.Interface(
     description=description,
     examples=examples,
     theme="huggingface",
-).launch(debug=True, enable_queue=True)
+).launch(debug=True, enable_queue=True)
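
For reference, a minimal sketch (not part of the commit) of how the updated function could be exercised outside Gradio, assuming the model weights download successfully and that Image1.png, one of the bundled example files, is present in the working directory:

# Quick local check of the updated vit2distilgpt2 (a sketch under the
# assumptions above): the function now returns a list of three captions,
# one string per returned beam.
from PIL import Image

img = Image.open("Image1.png")
captions = vit2distilgpt2(img)
for i, caption in enumerate(captions, start=1):
    print(f"Caption {i}: {caption}")

With num_beams=5 and num_return_sequences=3, generate keeps five beam hypotheses and returns the three highest-scoring ones, which is why the interface now declares three Textbox outputs instead of one; num_return_sequences must not exceed num_beams, or generate raises an error.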