"""Gradio demo: upload an image and get a generated English caption.

A pretrained ``VisionEncoderDecoderModel`` is loaded once at import time and
exposed through a single-input / single-output ``gr.Interface``.
"""

from typing import Optional

from PIL import Image
from transformers import (
    PreTrainedTokenizerFast,
    ViTFeatureExtractor,
    VisionEncoderDecoderModel,
)
import gradio as gr

# Loaded once at module import so every request reuses the same objects.
# NOTE(review): the model, the feature extractor and the tokenizer are loaded
# from three different checkpoints -- confirm that the preprocessing and the
# vocabulary match what "ydshieh/vit-gpt2-coco-en" was trained with.
model = VisionEncoderDecoderModel.from_pretrained("ydshieh/vit-gpt2-coco-en")
vit_feature_extractor = ViTFeatureExtractor.from_pretrained(
    "google/vit-base-patch32-224-in21k"
)
tokenizer = PreTrainedTokenizerFast.from_pretrained("distilgpt2")


def caption_images(image: Optional[Image.Image]) -> str:
    """Generate a caption for ``image``.

    Args:
        image: The uploaded picture as delivered by the ``type='pil'`` Gradio
            image component, or ``None`` when the form is submitted without
            an image.

    Returns:
        The first decoded sequence with surrounding whitespace stripped, or
        an empty string when no image was supplied.
    """
    # Gradio passes None when nothing was uploaded; return an empty caption
    # instead of failing inside the feature extractor.
    if image is None:
        return ""

    pixel_values = vit_feature_extractor(
        images=image, return_tensors="pt"
    ).pixel_values
    # Beam search with 5 beams; the input tensor is explicitly placed on CPU.
    generated_ids = model.generate(pixel_values.to("cpu"), num_beams=5)
    captions = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return captions[0].strip()


inputs = [
    gr.components.Image(type="pil", label="Original Image"),
]

outputs = [
    gr.components.Textbox(label="Caption"),
]

title = "Simple Image captioning Application"
description = "Upload an image to see the caption generated"
# Example file path is relative to the working directory the app starts in.
example = ["messi.jpg"]

demo = gr.Interface(
    caption_images,
    inputs,
    outputs,
    title=title,
    description=description,
    examples=example,
)
demo.launch(debug=True)