# -*- coding: utf-8 -*-
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image

# Load the pretrained ViT-encoder / GPT-2-decoder captioning model, along with
# its image processor and tokenizer, from the Hugging Face Hub.
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
vit_feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

def vit2distilgpt2(img):
    # Preprocess the PIL image into the pixel values the ViT encoder expects.
    pixel_values = vit_feature_extractor(images=img, return_tensors="pt").pixel_values
    # Generate caption token ids with beam search; this demo runs on CPU.
    generated_ids = model.generate(pixel_values.to("cpu"), num_beams=5)
    # Decode the ids to text and return the first sentence of the top caption.
    generated_sentences = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return generated_sentences[0].split(".")[0]
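
# Optional local sanity check (a minimal sketch; it assumes one of the example
# images listed below, e.g. "Img_1.jpg", sits next to this script). Uncomment
# to try the caption function without launching the Gradio app:
# print(vit2distilgpt2(Image.open("Img_1.jpg")))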

import gradio as gr

# Gradio UI components: a PIL image in, a text caption out.
inputs = [
    gr.Image(type="pil", label="Original Image")
]

outputs = [
    gr.Textbox(label="Caption")
]
title = "Vision Transformer using nlpconnect for Image to Text generation"
description = "ViT and GPT-2 are used to generate a caption for the uploaded image. The model was trained on the COCO dataset."
article = "<a href='https://huggingface.co/nlpconnect/vit-gpt2-image-captioning'>Model Repo on Hugging Face Model Hub</a>"
examples = [
    ["Img_1.jpg"],
    ["Img_2.jpg"],
    ["img_2t.jpg"],
    ["img_t2.jpg"],
    ["img4_t.jpg"]
]
gr.Interface(
    vit2distilgpt2,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
    theme="huggingface",
).queue().launch(debug=True)